diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..3924b95e --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2023-11-27T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2311.16101v1","updated":"2023-11-27T18:59:42Z","published":"2023-11-27T18:59:42Z","title":"How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for\n Vision LLMs","summary":" This work focuses on the potential of Vision LLMs (VLLMs) in visual\nreasoning. Different from prior studies, we shift our focus from evaluating\nstandard performance to introducing a comprehensive safety evaluation suite,\ncovering both out-of-distribution (OOD) generalization and adversarial\nrobustness. For the OOD evaluation, we present two novel VQA datasets, each\nwith one variant, designed to test model performance under challenging\nconditions. In exploring adversarial robustness, we propose a straightforward\nattack strategy for misleading VLLMs to produce visual-unrelated responses.\nMoreover, we assess the efficacy of two jailbreaking strategies, targeting\neither the vision or language component of VLLMs. Our evaluation of 21 diverse\nmodels, ranging from open-source VLLMs to GPT-4V, yields interesting\nobservations: 1) Current VLLMs struggle with OOD texts but not images, unless\nthe visual information is limited; and 2) These VLLMs can be easily misled by\ndeceiving vision encoders only, and their vision-language training often\ncompromise safety protocols. We release this safety evaluation suite at\nhttps://github.com/UCSC-VLAA/vllm-safety-benchmark.\n","authors":["Haoqin Tu","Chenhang Cui","Zijun Wang","Yiyang Zhou","Bingchen Zhao","Junlin Han","Wangchunshu Zhou","Huaxiu Yao","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2311.16101v1.pdf","comment":"H.T., C.C., and Z.W. contribute equally. Work done during H.T. and\n Z.W.'s internship at UCSC, and C.C. and Y.Z.'s internship at UNC"},{"id":"http://arxiv.org/abs/2311.16087v1","updated":"2023-11-27T18:56:14Z","published":"2023-11-27T18:56:14Z","title":"DUnE: Dataset for Unified Editing","summary":" Even the most advanced language models remain susceptible to errors\nnecessitating to modify these models without initiating a comprehensive\nretraining process. Model editing refers to the modification of a model's\nknowledge or representations in a manner that produces the desired outcomes.\nPrior research primarily centered around editing factual data e.g. \"Messi plays\nfor Inter Miami\" confining the definition of an edit to a knowledge triplet\ni.e. (subject, object, relation). However, as the applications of language\nmodels expand, so do the diverse ways in which we wish to edit and refine their\noutputs. In this study, we broaden the scope of the editing problem to include\nan array of editing cases such as debiasing and rectifying reasoning errors and\ndefine an edit as any natural language expression that solicits a change in the\nmodel's outputs. We are introducing DUnE-an editing benchmark where edits are\nnatural language sentences and propose that DUnE presents a challenging yet\nrelevant task. To substantiate this claim, we conduct an extensive series of\nexperiments testing various editing approaches to address DUnE, demonstrating\ntheir respective strengths and weaknesses. 
We show that retrieval-augmented\nlanguage modeling can outperform specialized editing techniques and neither set\nof approaches has fully solved the generalized editing problem covered by our\nbenchmark.\n","authors":["Afra Feyza Akyürek","Eric Pan","Garry Kuwanto","Derry Wijaya"],"pdf_url":"https://arxiv.org/pdf/2311.16087v1.pdf","comment":"Accepted at EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.16083v1","updated":"2023-11-27T18:53:31Z","published":"2023-11-27T18:53:31Z","title":"BERT Goes Off-Topic: Investigating the Domain Transfer Challenge using\n Genre Classification","summary":" While performance of many text classification tasks has been recently\nimproved due to Pre-trained Language Models (PLMs), in this paper we show that\nthey still suffer from a performance gap when the underlying distribution of\ntopics changes. For example, a genre classifier trained on \\textit{political}\ntopics often fails when tested on documents about \\textit{sport} or\n\\textit{medicine}. In this work, we quantify this phenomenon empirically with a\nlarge corpus and a large set of topics. Consequently, we verify that domain\ntransfer remains challenging both for classic PLMs, such as BERT, and for\nmodern large models, such as GPT-3. We also suggest and successfully test a\npossible remedy: after augmenting the training dataset with\ntopically-controlled synthetic texts, the F1 score improves by up to 50\\% for\nsome topics, nearing on-topic training results, while others show little to no\nimprovement. While our empirical results focus on genre classification, our\nmethodology is applicable to other classification tasks such as gender,\nauthorship, or sentiment classification. The code and data to replicate the\nexperiments are available at https://github.com/dminus1/genre\n","authors":["Dmitri Roussinov","Serge Sharoff"],"pdf_url":"https://arxiv.org/pdf/2311.16083v1.pdf","comment":"Published at EMNLP'2023"},{"id":"http://arxiv.org/abs/2311.16079v1","updated":"2023-11-27T18:49:43Z","published":"2023-11-27T18:49:43Z","title":"MEDITRON-70B: Scaling Medical Pretraining for Large Language Models","summary":" Large language models (LLMs) can potentially democratize access to medical\nknowledge. While many efforts have been made to harness and improve LLMs'\nmedical knowledge and reasoning capacities, the resulting models are either\nclosed-source (e.g., PaLM, GPT-4) or limited in scale (<= 13B parameters),\nwhich restricts their abilities. In this work, we improve access to large-scale\nmedical LLMs by releasing MEDITRON: a suite of open-source LLMs with 7B and 70B\nparameters adapted to the medical domain. MEDITRON builds on Llama-2 (through\nour adaptation of Nvidia's Megatron-LM distributed trainer), and extends\npretraining on a comprehensively curated medical corpus, including selected\nPubMed articles, abstracts, and internationally-recognized medical guidelines.\nEvaluations using four major medical benchmarks show significant performance\ngains over several state-of-the-art baselines before and after task-specific\nfinetuning. Overall, MEDITRON achieves a 6% absolute performance gain over the\nbest public baseline in its parameter class and 3% over the strongest baseline\nwe finetuned from Llama-2. Compared to closed-source LLMs, MEDITRON-70B\noutperforms GPT-3.5 and Med-PaLM and is within 5% of GPT-4 and 10% of\nMed-PaLM-2. 
We release our code for curating the medical pretraining corpus and\nthe MEDITRON model weights to drive open-source development of more capable\nmedical LLMs.\n","authors":["Zeming Chen","Alejandro Hernández Cano","Angelika Romanou","Antoine Bonnet","Kyle Matoba","Francesco Salvi","Matteo Pagliardini","Simin Fan","Andreas Köpf","Amirkeivan Mohtashami","Alexandre Sallinen","Alireza Sakhaeirad","Vinitra Swamy","Igor Krawczuk","Deniz Bayazit","Axel Marmet","Syrielle Montariol","Mary-Anne Hartley","Martin Jaggi","Antoine Bosselut"],"pdf_url":"https://arxiv.org/pdf/2311.16079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16075v1","updated":"2023-11-27T18:46:17Z","published":"2023-11-27T18:46:17Z","title":"BioLORD-2023: Semantic Textual Representations Fusing LLM and Clinical\n Knowledge Graph Insights","summary":" In this study, we investigate the potential of Large Language Models to\ncomplement biomedical knowledge graphs in the training of semantic models for\nthe biomedical and clinical domains. Drawing on the wealth of the UMLS\nknowledge graph and harnessing cutting-edge Large Language Models, we propose a\nnew state-of-the-art approach for obtaining high-fidelity representations of\nbiomedical concepts and sentences, consisting of three steps: an improved\ncontrastive learning phase, a novel self-distillation phase, and a weight\naveraging phase. Through rigorous evaluations via the extensive BioLORD testing\nsuite and diverse downstream tasks, we demonstrate consistent and substantial\nperformance improvements over the previous state of the art (e.g. +2pts on\nMedSTS, +2.5pts on MedNLI-S, +6.1pts on EHR-Rel-B). Besides our new\nstate-of-the-art biomedical model for English, we also distill and release a\nmultilingual model compatible with 50+ languages and finetuned on 7 European\nlanguages. Many clinical pipelines can benefit from our latest models. Our new\nmultilingual model enables a range of languages to benefit from our\nadvancements in biomedical semantic representation learning, opening a new\navenue for bioinformatics researchers around the world. As a result, we hope to\nsee BioLORD-2023 becoming a precious tool for future biomedical applications.\n","authors":["François Remy","Kris Demuynck","Thomas Demeester"],"pdf_url":"https://arxiv.org/pdf/2311.16075v1.pdf","comment":"Preprint of upcoming journal article"},{"id":"http://arxiv.org/abs/2107.10021v2","updated":"2023-11-27T18:09:19Z","published":"2021-07-21T11:31:57Z","title":"Neuradicon: operational representation learning of neuroimaging reports","summary":" Radiological reports typically summarize the content and interpretation of\nimaging studies in unstructured form that precludes quantitative analysis. This\nlimits the monitoring of radiological services to throughput undifferentiated\nby content, impeding specific, targeted operational optimization. Here we\npresent Neuradicon, a natural language processing (NLP) framework for\nquantitative analysis of neuroradiological reports. Our framework is a hybrid\nof rule-based and artificial intelligence models to represent neurological\nreports in succinct, quantitative form optimally suited to operational\nguidance. We demonstrate the application of Neuradicon to operational\nphenotyping of a corpus of 336,569 reports, and report excellent\ngeneralizability across time and two independent healthcare institutions.\n","authors":["Henry Watkins","Robert Gray","Adam Julius","Yee-Haur Mah","Walter H. L. 
Pinaya","Paul Wright","Ashwani Jha","Holger Engleitner","Jorge Cardoso","Sebastien Ourselin","Geraint Rees","Rolf Jaeger","Parashkev Nachev"],"pdf_url":"https://arxiv.org/pdf/2107.10021v2.pdf","comment":"26 pages, 11 figures"},{"id":"http://arxiv.org/abs/2308.14306v3","updated":"2023-11-27T17:43:20Z","published":"2023-08-28T04:57:07Z","title":"Evaluating the Robustness to Instructions of Large Language Models","summary":" Recently, Instruction fine-tuning has risen to prominence as a potential\nmethod for enhancing the zero-shot capabilities of Large Language Models (LLMs)\non novel tasks. This technique has shown an exceptional ability to boost the\nperformance of moderately sized LLMs, sometimes even reaching performance\nlevels comparable to those of much larger model variants. The focus is on the\nrobustness of instruction-tuned LLMs to seen and unseen tasks. We conducted an\nexploration of six models including Alpaca, Vicuna, WizardLM, and Traditional\nTask-oriented Models(Flan-T5-XL/XXL, T0++) using real-world relation extraction\ndatasets as case studies. We carried out a comprehensive evaluation of these\ninstruction-following LLMs which have been tuned based on open-domain\ninstructions and task-oriented instructions. The main discussion is their\nperformance and robustness towards instructions. We have observed that in most\ncases, the model's performance in dealing with unfamiliar instructions tends to\nworsen significantly, and the robustness of the model for RE instructions\ndeteriorates compared to QA. Further, we discovered that up until a certain\nparameter size threshold (3B), the performance of the FLAN-T5 model improves as\nthe parameter count increases. The robustness of different scales of FLAN-T5\nmodels to RE instruction is worse than the robustness to QA instruction.\n","authors":["Yuansheng Ni","Sichao Jiang","Xinyu wu","Hui Shen","Yuli Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.14306v3.pdf","comment":"There were major problems with the experimental data"},{"id":"http://arxiv.org/abs/2310.06627v2","updated":"2023-11-27T16:59:39Z","published":"2023-10-10T13:45:59Z","title":"What If the TV Was Off? Examining Counterfactual Reasoning Abilities of\n Multi-modal Language Models","summary":" Counterfactual reasoning, a fundamental aspect of human cognition, involves\ncontemplating alternatives to established facts or past events, significantly\nenhancing our abilities in planning and decision-making. In light of the\nadvancements in current multi-modal large language models, we explore their\neffectiveness in counterfactual reasoning. To facilitate this investigation, we\nintroduce a novel dataset, C-VQA, specifically designed to test the\ncounterfactual reasoning capabilities of modern multi-modal large language\nmodels. This dataset is constructed by infusing original questions with\ncounterfactual presuppositions, spanning various types such as numerical and\nboolean queries. It encompasses a mix of real and synthetic data, representing\na wide range of difficulty levels. Our thorough evaluations of contemporary\nvision-language models using this dataset have revealed substantial performance\ndrops, with some models showing up to a 40\\% decrease, highlighting a\nsignificant gap between current models and human-like vision reasoning\ncapabilities. We hope our dataset will serve as a vital benchmark for\nevaluating the counterfactual reasoning capabilities of models. 
Code and\ndataset are publicly available at https://bzhao.me/C-VQA/.\n","authors":["Letian Zhang","Xiaotong Zhai","Zhongkai Zhao","Yongshuo Zong","Xin Wen","Bingchen Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.06627v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14353v2","updated":"2023-11-27T16:55:29Z","published":"2023-11-24T08:53:52Z","title":"Average Token Delay: A Duration-aware Latency Metric for Simultaneous\n Translation","summary":" Simultaneous translation is a task in which the translation begins before the\nend of an input speech segment. Its evaluation should be conducted based on\nlatency in addition to quality, and for users, the smallest possible amount of\nlatency is preferable. Most existing metrics measure latency based on the start\ntimings of partial translations and ignore their duration. This means such\nmetrics do not penalize the latency caused by long translation output, which\ndelays the comprehension of users and subsequent translations. In this work, we\npropose a novel latency evaluation metric for simultaneous translation called\n\\emph{Average Token Delay} (ATD) that focuses on the duration of partial\ntranslations. We demonstrate its effectiveness through analyses simulating\nuser-side latency based on Ear-Voice Span (EVS). In our experiment, ATD had the\nhighest correlation with EVS among baseline latency metrics under most\nconditions.\n","authors":["Yasumasa Kano","Katsuhito Sudoh","Satoshi Nakamura"],"pdf_url":"https://arxiv.org/pdf/2311.14353v2.pdf","comment":"Extended version of the paper (doi: 10.21437/Interspeech.2023-933)\n which appeared in INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2311.15983v1","updated":"2023-11-27T16:28:20Z","published":"2023-11-27T16:28:20Z","title":"Sparsify-then-Classify: From Internal Neurons of Large Language Models\n To Efficient Text Classifiers","summary":" Among the many tasks that Large Language Models (LLMs) have revolutionized is\ntext classification. However, existing approaches for applying pretrained LLMs\nto text classification predominantly rely on using single token outputs from\nonly the last layer of hidden states. As a result, they suffer from limitations\nin efficiency, task-specificity, and interpretability. In our work, we\ncontribute an approach that uses all internal representations by employing\nmultiple pooling strategies on all activation and hidden states. Our novel\nlightweight strategy, Sparsify-then-Classify (STC) first sparsifies\ntask-specific features layer-by-layer, then aggregates across layers for text\nclassification. STC can be applied as a seamless plug-and-play module on top of\nexisting LLMs. Our experiments on a comprehensive set of models and datasets\ndemonstrate that STC not only consistently improves the classification\nperformance of pretrained and fine-tuned models, but is also more efficient for\nboth training and inference, and is more intrinsically interpretable.\n","authors":["Yilun Liu","Difan Jiao","Ashton Anderson"],"pdf_url":"https://arxiv.org/pdf/2311.15983v1.pdf","comment":"23 pages, 5 figures, 8 tables Code available at\n https://github.com/difanj0713/Sparsify-then-Classify"},{"id":"http://arxiv.org/abs/2311.15964v1","updated":"2023-11-27T16:07:37Z","published":"2023-11-27T16:07:37Z","title":"Efficient Pre-training for Localized Instruction Generation of Videos","summary":" Procedural videos show step-by-step demonstrations of tasks like recipe\npreparation. 
Understanding such videos is challenging, involving the precise\nlocalization of steps and the generation of textual instructions. Manually\nannotating steps and writing instructions is costly, which limits the size of\ncurrent datasets and hinders effective learning. Leveraging large but noisy\nvideo-transcript datasets for pre-training can boost performance, but demands\nsignificant computational resources. Furthermore, transcripts contain\nirrelevant content and exhibit style variation compared to instructions written\nby human annotators. To mitigate both issues, we propose a technique,\nSieve-&-Swap, to automatically curate a smaller dataset: (i) Sieve filters\nirrelevant transcripts and (ii) Swap enhances the quality of the text\ninstruction by automatically replacing the transcripts with human-written\ninstructions from a text-only recipe dataset. The curated dataset, three orders\nof magnitude smaller than current web-scale datasets, enables efficient\ntraining of large-scale models with competitive performance. We complement our\nSieve-\\&-Swap approach with a Procedure Transformer (ProcX) for end-to-end step\nlocalization and instruction generation for procedural videos. When this model\nis pre-trained on our curated dataset, it achieves state-of-the-art performance\nin zero-shot and finetuning settings on YouCook2 and Tasty, while using a\nfraction of the computational resources.\n","authors":["Anil Batra","Davide Moltisanti","Laura Sevilla-Lara","Marcus Rohrbach","Frank Keller"],"pdf_url":"https://arxiv.org/pdf/2311.15964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15954v1","updated":"2023-11-27T15:58:28Z","published":"2023-11-27T15:58:28Z","title":"A Quantitative Approach to Understand Self-Supervised Models as\n Cross-lingual Feature Extractors","summary":" In this work, we study the features extracted by English self-supervised\nlearning (SSL) models in cross-lingual contexts and propose a new metric to\npredict the quality of feature representations. Using automatic speech\nrecognition (ASR) as a downstream task, we analyze the effect of model size,\ntraining objectives, and model architecture on the models' performance as a\nfeature extractor for a set of topologically diverse corpora. We develop a\nnovel metric, the Phonetic-Syntax Ratio (PSR), to measure the phonetic and\nsynthetic information in the extracted representations using deep generalized\ncanonical correlation analysis. Results show the contrastive loss in the\nwav2vec2.0 objective facilitates more effective cross-lingual feature\nextraction. There is a positive correlation between PSR scores and ASR\nperformance, suggesting that phonetic information extracted by monolingual SSL\nmodels can be used for downstream tasks in cross-lingual settings. The proposed\nmetric is an effective indicator of the quality of the representations and can\nbe useful for model selection.\n","authors":["Shuyue Stella Li","Beining Xu","Xiangyu Zhang","Hexin Liu","Wenhan Chao","Leibny Paola Garcia"],"pdf_url":"https://arxiv.org/pdf/2311.15954v1.pdf","comment":"12 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2311.15946v1","updated":"2023-11-27T15:53:11Z","published":"2023-11-27T15:53:11Z","title":"Leveraging deep active learning to identify low-resource mobility\n functioning information in public clinical notes","summary":" Function is increasingly recognized as an important indicator of whole-person\nhealth, although it receives little attention in clinical natural language\nprocessing research. 
We introduce the first public annotated dataset\nspecifically on the Mobility domain of the International Classification of\nFunctioning, Disability and Health (ICF), aiming to facilitate automatic\nextraction and analysis of functioning information from free-text clinical\nnotes. We utilize the National NLP Clinical Challenges (n2c2) research dataset\nto construct a pool of candidate sentences using keyword expansion. Our active\nlearning approach, using query-by-committee sampling weighted by density\nrepresentativeness, selects informative sentences for human annotation. We\ntrain BERT and CRF models, and use predictions from these models to guide the\nselection of new sentences for subsequent annotation iterations. Our final\ndataset consists of 4,265 sentences with a total of 11,784 entities, including\n5,511 Action entities, 5,328 Mobility entities, 306 Assistance entities, and\n639 Quantification entities. The inter-annotator agreement (IAA), averaged over\nall entity types, is 0.72 for exact matching and 0.91 for partial matching. We\nalso train and evaluate common BERT models and state-of-the-art Nested NER\nmodels. The best F1 scores are 0.84 for Action, 0.7 for Mobility, 0.62 for\nAssistance, and 0.71 for Quantification. Empirical results demonstrate\npromising potential of NER models to accurately extract mobility functioning\ninformation from clinical text. The public availability of our annotated\ndataset will facilitate further research to comprehensively capture functioning\ninformation in electronic health records (EHRs).\n","authors":["Tuan-Dung Le","Zhuqi Miao","Samuel Alvarado","Brittany Smith","William Paiva","Thanh Thieu"],"pdf_url":"https://arxiv.org/pdf/2311.15946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15941v1","updated":"2023-11-27T15:49:29Z","published":"2023-11-27T15:49:29Z","title":"Tell2Design: A Dataset for Language-Guided Floor Plan Generation","summary":" We consider the task of generating designs directly from natural language\ndescriptions, and consider floor plan generation as the initial research area.\nLanguage conditional generative models have recently been very successful in\ngenerating high-quality artistic images. However, designs must satisfy\ndifferent constraints that are not present in generating artistic images,\nparticularly spatial and relational constraints. We make multiple contributions\nto initiate research on this task. First, we introduce a novel dataset,\n\\textit{Tell2Design} (T2D), which contains more than $80k$ floor plan designs\nassociated with natural language instructions. Second, we propose a\nSequence-to-Sequence model that can serve as a strong baseline for future\nresearch. Third, we benchmark this task with several text-conditional image\ngeneration models. We conclude by conducting human evaluations on the generated\nsamples and providing an analysis of human performance. 
We hope our\ncontributions will propel the research on language-guided design generation\nforward.\n","authors":["Sicong Leng","Yang Zhou","Mohammed Haroon Dupty","Wee Sun Lee","Sam Conrad Joyce","Wei Lu"],"pdf_url":"https://arxiv.org/pdf/2311.15941v1.pdf","comment":"Paper published in ACL2023; Area Chair Award; Best Paper Nomination"},{"id":"http://arxiv.org/abs/2311.15930v1","updated":"2023-11-27T15:38:17Z","published":"2023-11-27T15:38:17Z","title":"WorldSense: A Synthetic Benchmark for Grounded Reasoning in Large\n Language Models","summary":" We propose WorldSense, a benchmark designed to assess the extent to which\nLLMs are consistently able to sustain tacit world models, by testing how they\ndraw simple inferences from descriptions of simple arrangements of entities.\nWorldsense is a synthetic benchmark with three problem types, each with their\nown trivial control, which explicitly avoids bias by decorrelating the abstract\nstructure of problems from the vocabulary and expressions, and by decorrelating\nall problem subparts with the correct response. We run our benchmark on three\nstate-of-the-art chat-LLMs (GPT3.5, GPT4 and Llama2-chat) and show that these\nmodels make errors even with as few as three objects. Furthermore, they have\nquite heavy response biases, preferring certain responses irrespective of the\nquestion. Errors persist even with chain-of-thought prompting and in-context\nlearning. Lastly, we show that while finetuning on similar problems does result\nin substantial improvements -- within- and out-of-distribution -- the finetuned\nmodels do not generalise beyond a constraint problem space.\n","authors":["Youssef Benchekroun","Megi Dervishi","Mark Ibrahim","Jean-Baptiste Gaya","Xavier Martinet","Grégoire Mialon","Thomas Scialom","Emmanuel Dupoux","Dieuwke Hupkes","Pascal Vincent"],"pdf_url":"https://arxiv.org/pdf/2311.15930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07590v2","updated":"2023-11-27T15:17:49Z","published":"2023-11-09T17:12:44Z","title":"Technical Report: Large Language Models can Strategically Deceive their\n Users when Put Under Pressure","summary":" We demonstrate a situation in which Large Language Models, trained to be\nhelpful, harmless, and honest, can display misaligned behavior and\nstrategically deceive their users about this behavior without being instructed\nto do so. Concretely, we deploy GPT-4 as an agent in a realistic, simulated\nenvironment, where it assumes the role of an autonomous stock trading agent.\nWithin this environment, the model obtains an insider tip about a lucrative\nstock trade and acts upon it despite knowing that insider trading is\ndisapproved of by company management. When reporting to its manager, the model\nconsistently hides the genuine reasons behind its trading decision. We perform\na brief investigation of how this behavior varies under changes to the setting,\nsuch as removing model access to a reasoning scratchpad, attempting to prevent\nthe misaligned behavior by changing system instructions, changing the amount of\npressure the model is under, varying the perceived risk of getting caught, and\nmaking other simple changes to the environment. 
To our knowledge, this is the\nfirst demonstration of Large Language Models trained to be helpful, harmless,\nand honest, strategically deceiving their users in a realistic situation\nwithout direct instructions or training for deception.\n","authors":["Jérémy Scheurer","Mikita Balesni","Marius Hobbhahn"],"pdf_url":"https://arxiv.org/pdf/2311.07590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13547v3","updated":"2023-11-27T15:10:00Z","published":"2023-05-22T23:43:23Z","title":"Self-Evolution Learning for Mixup: Enhance Data Augmentation on Few-Shot\n Text Classification Tasks","summary":" Text classification tasks often encounter few shot scenarios with limited\nlabeled data, and addressing data scarcity is crucial. Data augmentation with\nmixup has shown to be effective on various text classification tasks. However,\nmost of the mixup methods do not consider the varying degree of learning\ndifficulty in different stages of training and generate new samples with one\nhot labels, resulting in the model over confidence. In this paper, we propose a\nself evolution learning (SE) based mixup approach for data augmentation in text\nclassification, which can generate more adaptive and model friendly pesudo\nsamples for the model training. SE focuses on the variation of the model's\nlearning ability. To alleviate the model confidence, we introduce a novel\ninstance specific label smoothing approach, which linearly interpolates the\nmodel's output and one hot labels of the original samples to generate new soft\nfor label mixing up. Through experimental analysis, in addition to improving\nclassification accuracy, we demonstrate that SE also enhances the model's\ngeneralize ability.\n","authors":["Haoqi Zheng","Qihuang Zhong","Liang Ding","Zhiliang Tian","Xin Niu","Dongsheng Li","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2305.13547v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15896v1","updated":"2023-11-27T15:01:26Z","published":"2023-11-27T15:01:26Z","title":"Data Generation for Post-OCR correction of Cyrillic handwriting","summary":" This paper introduces a novel approach to post-Optical Character Recognition\nCorrection (POC) for handwritten Cyrillic text, addressing a significant gap in\ncurrent research methodologies. This gap is due to the lack of large text\ncorporas that provide OCR errors for further training of language-based POC\nmodels, which are demanding in terms of corpora size. Our study primarily\nfocuses on the development and application of a synthetic handwriting\ngeneration engine based on B\\'ezier curves. Such an engine generates highly\nrealistic handwritten text in any amounts, which we utilize to create a\nsubstantial dataset by transforming Russian text corpora sourced from the\ninternet. We apply a Handwritten Text Recognition (HTR) model to this dataset\nto identify OCR errors, forming the basis for our POC model training. The\ncorrection model is trained on a 90-symbol input context, utilizing a\npre-trained T5 architecture with a seq2seq correction task. We evaluate our\napproach on HWR200 and School_notebooks_RU datasets as they provide significant\nchallenges in the HTR domain. Furthermore, POC can be used to highlight errors\nfor teachers, evaluating student performance. This can be done simply by\ncomparing sentences before and after correction, displaying differences in\ntext. 
Our primary contribution lies in the innovative use of B\\'ezier curves\nfor Cyrillic text generation and subsequent error correction using a\nspecialized POC model. We validate our approach by presenting Word Accuracy\nRate (WAR) and Character Accuracy Rate (CAR) results, both with and without\npost-OCR correction, using real open corporas of handwritten Cyrillic text.\nThese results, coupled with our methodology, are designed to be reproducible,\npaving the way for further advancements in the field of OCR and handwritten\ntext analysis. Paper contributions can be found in\nhttps://github.com/dbrainio/CyrillicHandwritingPOC\n","authors":["Evgenii Davydkin","Aleksandr Markelov","Egor Iuldashev","Anton Dudkin","Ivan Krivorotov"],"pdf_url":"https://arxiv.org/pdf/2311.15896v1.pdf","comment":"17 pages, 27 figures, 6 tables, 26 references"},{"id":"http://arxiv.org/abs/2307.15176v2","updated":"2023-11-27T14:35:05Z","published":"2023-07-27T20:11:07Z","title":"RCT Rejection Sampling for Causal Estimation Evaluation","summary":" Confounding is a significant obstacle to unbiased estimation of causal\neffects from observational data. For settings with high-dimensional covariates\n-- such as text data, genomics, or the behavioral social sciences --\nresearchers have proposed methods to adjust for confounding by adapting machine\nlearning methods to the goal of causal estimation. However, empirical\nevaluation of these adjustment methods has been challenging and limited. In\nthis work, we build on a promising empirical evaluation strategy that\nsimplifies evaluation design and uses real data: subsampling randomized\ncontrolled trials (RCTs) to create confounded observational datasets while\nusing the average causal effects from the RCTs as ground-truth. We contribute a\nnew sampling algorithm, which we call RCT rejection sampling, and provide\ntheoretical guarantees that causal identification holds in the observational\ndata to allow for valid comparisons to the ground-truth RCT. Using synthetic\ndata, we show our algorithm indeed results in low bias when oracle estimators\nare evaluated on the confounded samples, which is not always the case for a\npreviously proposed algorithm. In addition to this identification result, we\nhighlight several finite data considerations for evaluation designers who plan\nto use RCT rejection sampling on their own datasets. As a proof of concept, we\nimplement an example evaluation pipeline and walk through these finite data\nconsiderations with a novel, real-world RCT -- which we release publicly --\nconsisting of approximately 70k observations and text data as high-dimensional\ncovariates. Together, these contributions build towards a broader agenda of\nimproved empirical evaluation for causal estimation.\n","authors":["Katherine A. Keith","Sergey Feldman","David Jurgens","Jonathan Bragg","Rohit Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2307.15176v2.pdf","comment":"Code and data at https://github.com/kakeith/rct_rejection_sampling"},{"id":"http://arxiv.org/abs/2310.14505v2","updated":"2023-11-27T14:23:16Z","published":"2023-10-23T02:32:30Z","title":"Sentiment analysis with adaptive multi-head attention in Transformer","summary":" We propose a novel framework based on the attention mechanism to identify the\nsentiment of a movie review document. Previous efforts on deep neural networks\nwith attention mechanisms focus on encoder and decoder with fixed numbers of\nmulti-head attention. 
Therefore, we need a mechanism to stop the attention\nprocess automatically if no more useful information can be read from the\nmemory.In this paper, we propose an adaptive multi-head attention architecture\n(AdaptAttn) which varies the number of attention heads based on length of\nsentences. AdaptAttn has a data preprocessing step where each document is\nclassified into any one of the three bins small, medium or large based on\nlength of the sentence. The document classified as small goes through two heads\nin each layer, the medium group passes four heads and the large group is\nprocessed by eight heads. We examine the merit of our model on the Stanford\nlarge movie review dataset. The experimental results show that the F1 score\nfrom our model is on par with the baseline model.\n","authors":["Fanfei Meng","David Demeter"],"pdf_url":"https://arxiv.org/pdf/2310.14505v2.pdf","comment":"Accepted by the 4th International Conference on Signal Processing and\n Machine Learning"},{"id":"http://arxiv.org/abs/2310.19106v3","updated":"2023-11-27T13:46:00Z","published":"2023-10-29T18:43:19Z","title":"PACuna: Automated Fine-Tuning of Language Models for Particle\n Accelerators","summary":" Navigating the landscape of particle accelerators has become increasingly\nchallenging with recent surges in contributions. These intricate devices\nchallenge comprehension, even within individual facilities. To address this, we\nintroduce PACuna, a fine-tuned language model refined through publicly\navailable accelerator resources like conferences, pre-prints, and books. We\nautomated data collection and question generation to minimize expert\ninvolvement and make the data publicly available. PACuna demonstrates\nproficiency in addressing intricate accelerator questions, validated by\nexperts. Our approach shows adapting language models to scientific domains by\nfine-tuning technical texts and auto-generated corpora capturing the latest\ndevelopments can further produce pre-trained models to answer some intricate\nquestions that commercially available assistants cannot and can serve as\nintelligent assistants for individual facilities.\n","authors":["Antonin Sulc","Raimund Kammering","Annika Eichler","Tim Wilksen"],"pdf_url":"https://arxiv.org/pdf/2310.19106v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15786v1","updated":"2023-11-27T13:01:59Z","published":"2023-11-27T13:01:59Z","title":"YUAN 2.0: A Large Language Model with Localized Filtering-based\n Attention","summary":" In this work, the Localized Filtering-based Attention (LFA) is introduced to\nincorporate prior knowledge of local dependencies of natural language into\nAttention. Based on LFA, we develop and release Yuan 2.0, a large language\nmodel with parameters ranging from 2.1 billion to 102.6 billion. A data\nfiltering and generation method is presented to build pretraining and\nfine-tuning dataset in high quality. A distributed training method with\nnon-uniform pipeline parallel, data parallel, and optimizer parallel is\nproposed, which greatly reduces the bandwidth requirements of intra-node\ncommunication, and achieves good performance in large-scale distributed\ntraining. Yuan 2.0 models display impressive ability in code generation, math\nproblem-solving, and chat compared with existing models. 
The latest version of\nYUAN 2.0, including model weights and source code, is accessible at Github.\n","authors":["Shaohua Wu","Xudong Zhao","Shenling Wang","Jiangang Luo","Lingjun Li","Xi Chen","Bing Zhao","Wei Wang","Tong Yu","Rongguo Zhang","Jiahua Zhang","Chao Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15781v1","updated":"2023-11-27T12:54:47Z","published":"2023-11-27T12:54:47Z","title":"Increasing Coverage and Precision of Textual Information in Multilingual\n Knowledge Graphs","summary":" Recent work in Natural Language Processing and Computer Vision has been using\ntextual information -- e.g., entity names and descriptions -- available in\nknowledge graphs to ground neural models to high-quality structured data.\nHowever, when it comes to non-English languages, the quantity and quality of\ntextual information are comparatively scarce. To address this issue, we\nintroduce the novel task of automatic Knowledge Graph Enhancement (KGE) and\nperform a thorough investigation on bridging the gap in both the quantity and\nquality of textual information between English and non-English languages. More\nspecifically, we: i) bring to light the problem of increasing multilingual\ncoverage and precision of entity names and descriptions in Wikidata; ii)\ndemonstrate that state-of-the-art methods, namely, Machine Translation (MT),\nWeb Search (WS), and Large Language Models (LLMs), struggle with this task;\niii) present M-NTA, a novel unsupervised approach that combines MT, WS, and\nLLMs to generate high-quality textual information; and, iv) study the impact of\nincreasing multilingual coverage and precision of non-English textual\ninformation in Entity Linking, Knowledge Graph Completion, and Question\nAnswering. As part of our effort towards better multilingual knowledge graphs,\nwe also introduce WikiKGE-10, the first human-curated benchmark to evaluate KGE\napproaches in 10 languages across 7 language families.\n","authors":["Simone Conia","Min Li","Daniel Lee","Umar Farooq Minhas","Ihab Ilyas","Yunyao Li"],"pdf_url":"https://arxiv.org/pdf/2311.15781v1.pdf","comment":"Camera ready for EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.15766v1","updated":"2023-11-27T12:37:51Z","published":"2023-11-27T12:37:51Z","title":"Knowledge Unlearning for LLMs: Tasks, Methods, and Challenges","summary":" In recent years, large language models (LLMs) have spurred a new research\nparadigm in natural language processing. Despite their excellent capability in\nknowledge-based question answering and reasoning, their potential to retain\nfaulty or even harmful knowledge poses risks of malicious application. The\nchallenge of mitigating this issue and transforming these models into purer\nassistants is crucial for their widespread applicability. Unfortunately,\nRetraining LLMs repeatedly to eliminate undesirable knowledge is impractical\ndue to their immense parameters. Knowledge unlearning, derived from analogous\nstudies on machine unlearning, presents a promising avenue to address this\nconcern and is notably advantageous in the context of LLMs. It allows for the\nremoval of harmful knowledge in an efficient manner, without affecting\nunrelated knowledge in the model. To this end, we provide a survey of knowledge\nunlearning in the era of LLMs. Firstly, we formally define the knowledge\nunlearning problem and distinguish it from related works. 
Subsequently, we\ncategorize existing knowledge unlearning methods into three classes: those\nbased on parameter optimization, parameter merging, and in-context learning,\nand introduce details of these unlearning methods. We further present\nevaluation datasets used in existing methods, and finally conclude this survey\nby presenting the ongoing challenges and future directions.\n","authors":["Nianwen Si","Hao Zhang","Heyu Chang","Wenlin Zhang","Dan Qu","Weiqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15766v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2311.15759v1","updated":"2023-11-27T12:29:20Z","published":"2023-11-27T12:29:20Z","title":"Towards Vision Enhancing LLMs: Empowering Multimodal Knowledge Storage\n and Sharing in LLMs","summary":" Recent advancements in multimodal large language models (MLLMs) have achieved\nsignificant multimodal generation capabilities, akin to GPT-4. These models\npredominantly map visual information into language representation space,\nleveraging the vast knowledge and powerful text generation abilities of LLMs to\nproduce multimodal instruction-following responses. We could term this method\nas LLMs for Vision because of its employing LLMs for visual-language\nunderstanding, yet observe that these MLLMs neglect the potential of harnessing\nvisual knowledge to enhance overall capabilities of LLMs, which could be\nregraded as Vision Enhancing LLMs. In this paper, we propose an approach called\nMKS2, aimed at enhancing LLMs through empowering Multimodal Knowledge Storage\nand Sharing in LLMs. Specifically, we introduce the Modular Visual Memory, a\ncomponent integrated into the internal blocks of LLMs, designed to store\nopen-world visual information efficiently. Additionally, we present a soft\nMixtures-of-Multimodal Experts architecture in LLMs to invoke multimodal\nknowledge collaboration during generation. Our comprehensive experiments\ndemonstrate that MKS2 substantially augments the reasoning capabilities of LLMs\nin contexts necessitating physical or commonsense knowledge. It also delivers\ncompetitive results on multimodal benchmarks.\n","authors":["Yunxin Li","Baotian Hu","Wei Wang","Xiaochun Cao","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15759v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2311.15723v1","updated":"2023-11-27T11:17:29Z","published":"2023-11-27T11:17:29Z","title":"Italian Crossword Generator: Enhancing Education through Interactive\n Word Puzzles","summary":" Educational crosswords offer numerous benefits for students, including\nincreased engagement, improved understanding, critical thinking, and memory\nretention. Creating high-quality educational crosswords can be challenging, but\nrecent advances in natural language processing and machine learning have made\nit possible to use language models to generate nice wordplays. The exploitation\nof cutting-edge language models like GPT3-DaVinci, GPT3-Curie, GPT3-Babbage,\nGPT3-Ada, and BERT-uncased has led to the development of a comprehensive system\nfor generating and verifying crossword clues. A large dataset of clue-answer\npairs was compiled to fine-tune the models in a supervised manner to generate\noriginal and challenging clues from a given keyword. On the other hand, for\ngenerating crossword clues from a given text, Zero/Few-shot learning techniques\nwere used to extract clues from the input text, adding variety and creativity\nto the puzzles. 
We employed the fine-tuned model to generate data and labeled\nthe acceptability of clue-answer parts with human supervision. To ensure\nquality, we developed a classifier by fine-tuning existing language models on\nthe labeled dataset. Conversely, to assess the quality of clues generated from\nthe given text using zero/few-shot learning, we employed a zero-shot learning\napproach to check the quality of generated clues. The results of the evaluation\nhave been very promising, demonstrating the effectiveness of the approach in\ncreating high-standard educational crosswords that offer students engaging and\nrewarding learning experiences.\n","authors":["Kamyar Zeinalipour","Tommaso laquinta","Asya Zanollo","Giovanni Angelini","Leonardo Rigutini","Marco Maggini","Marco Gori"],"pdf_url":"https://arxiv.org/pdf/2311.15723v1.pdf","comment":"Accepted Paper for CLiC-it 2023 - 9th Italian Conference on\n Computational Linguistics"},{"id":"http://arxiv.org/abs/2311.15716v1","updated":"2023-11-27T10:59:16Z","published":"2023-11-27T10:59:16Z","title":"Justifiable Artificial Intelligence: Engineering Large Language Models\n for Legal Applications","summary":" In this work, I discuss how Large Language Models can be applied in the legal\ndomain, circumventing their current drawbacks. Despite their large success and\nacceptance, their lack of explainability hinders legal experts to trust in\ntheir output, and this happens rightfully so. However, in this paper, I argue\nin favor of a new view, Justifiable Artificial Intelligence, instead of\nfocusing on Explainable Artificial Intelligence. I discuss in this paper how\ngaining evidence for and against a Large Language Model's output may make their\ngenerated texts more trustworthy - or hold them accountable for misinformation.\n","authors":["Sabine Wehnert"],"pdf_url":"https://arxiv.org/pdf/2311.15716v1.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2310.01825v2","updated":"2023-11-27T10:39:13Z","published":"2023-10-03T06:42:28Z","title":"Empirical Study of PEFT techniques for Winter Wheat Segmentation","summary":" Parameter Efficient Fine Tuning (PEFT) techniques have recently experienced\nsignificant growth and have been extensively employed to adapt large vision and\nlanguage models to various domains, enabling satisfactory model performance\nwith minimal computational needs. Despite these advances, more research has yet\nto delve into potential PEFT applications in real-life scenarios, particularly\nin the critical domains of remote sensing and crop monitoring. The diversity of\nclimates across different regions and the need for comprehensive large-scale\ndatasets have posed significant obstacles to accurately identify crop types\nacross varying geographic locations and changing growing seasons. This study\nseeks to bridge this gap by comprehensively exploring the feasibility of\ncross-area and cross-year out-of-distribution generalization using the\nState-of-the-Art (SOTA) wheat crop monitoring model. The aim of this work is to\nexplore PEFT approaches for crop monitoring. Specifically, we focus on adapting\nthe SOTA TSViT model to address winter wheat field segmentation, a critical\ntask for crop monitoring and food security. This adaptation process involves\nintegrating different PEFT techniques, including BigFit, LoRA, Adaptformer, and\nprompt tuning. 
Using PEFT techniques, we achieved notable results comparable to\nthose achieved using full fine-tuning methods while training only a mere 0.7%\nparameters of the whole TSViT architecture. The in-house labeled data-set,\nreferred to as the Beqaa-Lebanon dataset, comprises high-quality annotated\npolygons for wheat and non-wheat classes with a total surface of 170 kmsq, over\nfive consecutive years. Using Sentinel-2 images, our model achieved a 84%\nF1-score. We intend to publicly release the Lebanese winter wheat data set,\ncode repository, and model weights.\n","authors":["Mohamad Hasan Zahweh","Hasan Nasrallah","Mustafa Shukor","Ghaleb Faour","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15698v1","updated":"2023-11-27T10:34:55Z","published":"2023-11-27T10:34:55Z","title":"Cerbero-7B: A Leap Forward in Language-Specific LLMs Through Enhanced\n Chat Corpus Generation and Evaluation","summary":" This study introduces a novel approach for generating high-quality,\nlanguage-specific chat corpora using a self-chat mechanism. We combine a\ngenerator LLM for creating new samples and an embedder LLM to ensure diversity.\nA new Masked Language Modelling (MLM) model-based quality assessment metric is\nproposed for evaluating and filtering the corpora. Utilizing the llama2-70b as\nthe generator and a multilingual sentence transformer as embedder, we generate\nan Italian chat corpus and refine the Fauno corpus, which is based on\ntranslated English ChatGPT self-chat data. The refinement uses structural\nassertions and Natural Language Processing techniques. Both corpora undergo a\ncomprehensive quality evaluation using the proposed MLM model-based quality\nmetric. The Italian LLM fine-tuned with these corpora demonstrates\nsignificantly enhanced language comprehension and question-answering skills.\nThe resultant model, cerbero-7b, establishes a new state-of-the-art for Italian\nLLMs. This approach marks a substantial advancement in the development of\nlanguage-specific LLMs, with a special emphasis on augmenting corpora for\nunderrepresented languages like Italian.\n","authors":["Federico A. Galatolo","Mario G. C. A. Cimino"],"pdf_url":"https://arxiv.org/pdf/2311.15698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06422v2","updated":"2023-11-27T10:18:36Z","published":"2023-10-10T08:46:10Z","title":"Large Language Models for Propaganda Detection","summary":" The prevalence of propaganda in our digital society poses a challenge to\nsocietal harmony and the dissemination of truth. Detecting propaganda through\nNLP in text is challenging due to subtle manipulation techniques and contextual\ndependencies. To address this issue, we investigate the effectiveness of modern\nLarge Language Models (LLMs) such as GPT-3 and GPT-4 for propaganda detection.\nWe conduct experiments using the SemEval-2020 task 11 dataset, which features\nnews articles labeled with 14 propaganda techniques as a multi-label\nclassification problem. Five variations of GPT-3 and GPT-4 are employed,\nincorporating various prompt engineering and fine-tuning strategies across the\ndifferent models. We evaluate the models' performance by assessing metrics such\nas $F1$ score, $Precision$, and $Recall$, comparing the results with the\ncurrent state-of-the-art approach using RoBERTa. Our findings demonstrate that\nGPT-4 achieves comparable results to the current state-of-the-art. 
Further,\nthis study analyzes the potential and challenges of LLMs in complex tasks like\npropaganda detection.\n","authors":["Kilian Sprenkamp","Daniel Gordon Jones","Liudmila Zavolokina"],"pdf_url":"https://arxiv.org/pdf/2310.06422v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15653v1","updated":"2023-11-27T09:33:13Z","published":"2023-11-27T09:33:13Z","title":"MoDS: Model-oriented Data Selection for Instruction Tuning","summary":" Instruction tuning has become the de facto method to equip large language\nmodels (LLMs) with the ability of following user instructions. Usually,\nhundreds of thousands or millions of instruction-following pairs are employed\nto fine-tune the foundation LLMs. Recently, some studies show that a small\nnumber of high-quality instruction data is enough. However, how to select\nappropriate instruction data for a given LLM is still an open problem. To\naddress this problem, in this paper we present a model-oriented data selection\n(MoDS) approach, which selects instruction data based on a new criteria\nconsidering three aspects: quality, coverage and necessity. First, our approach\nutilizes a quality evaluation model to filter out the high-quality subset from\nthe original instruction dataset, and then designs an algorithm to further\nselect from the high-quality subset a seed instruction dataset with good\ncoverage. The seed dataset is applied to fine-tune the foundation LLM to obtain\nan initial instruction-following LLM. Finally, we develop a necessity\nevaluation model to find out the instruction data which are performed badly in\nthe initial instruction-following LLM and consider them necessary instructions\nto further improve the LLMs. In this way, we can get a small high-quality,\nbroad-coverage and high-necessity subset from the original instruction\ndatasets. Experimental results show that, the model fine-tuned with 4,000\ninstruction pairs selected by our approach could perform better than the model\nfine-tuned with the full original dataset which includes 214k instruction data.\n","authors":["Qianlong Du","Chengqing Zong","Jiajun Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15648v1","updated":"2023-11-27T09:20:12Z","published":"2023-11-27T09:20:12Z","title":"Reinforcement Learning from Diffusion Feedback: Q* for Image Search","summary":" Large vision-language models are steadily gaining personalization\ncapabilities at the cost of fine-tuning or data augmentation. We present two\nmodels for image generation using model-agnostic learning that align semantic\npriors with generative capabilities. RLDF, or Reinforcement Learning from\nDiffusion Feedback, is a singular approach for visual imitation through\nprior-preserving reward function guidance. This employs Q-learning (with\nstandard Q*) for generation and follows a semantic-rewarded trajectory for\nimage search through finite encoding-tailored actions. The second proposed\nmethod, noisy diffusion gradient, is optimization driven. At the root of both\nmethods is a special CFG encoding that we propose for continual semantic\nguidance. Using only a single input image and no text input, RLDF generates\nhigh-quality images over varied domains including retail, sports and\nagriculture showcasing class-consistency and strong visual diversity. 
Project\nwebsite is available at https://infernolia.github.io/RLDF.\n","authors":["Aboli Marathe"],"pdf_url":"https://arxiv.org/pdf/2311.15648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15642v1","updated":"2023-11-27T09:12:35Z","published":"2023-11-27T09:12:35Z","title":"InfoPattern: Unveiling Information Propagation Patterns in Social Media","summary":" Social media play a significant role in shaping public opinion and\ninfluencing ideological communities through information propagation. Our demo\nInfoPattern centers on the interplay between language and human ideology. The\ndemo (Code: https://github.com/blender-nlp/InfoPattern ) is capable of: (1) red\nteaming to simulate adversary responses from opposite ideology communities; (2)\nstance detection to identify the underlying political sentiments in each\nmessage; (3) information propagation graph discovery to reveal the evolution of\nclaims across various communities over time. (Live Demo:\nhttps://incas.csl.illinois.edu/blender/About )\n","authors":["Chi Han","Jialiang Xu","Manling Li","Hanning Zhang","Tarek Abdelzaher","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2311.15642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17255v3","updated":"2023-11-27T08:57:10Z","published":"2023-09-29T14:03:34Z","title":"Knowledge Graphs for the Life Sciences: Recent Developments, Challenges\n and Opportunities","summary":" The term life sciences refers to the disciplines that study living organisms\nand life processes, and include chemistry, biology, medicine, and a range of\nother related disciplines. Research efforts in life sciences are heavily\ndata-driven, as they produce and consume vast amounts of scientific data, much\nof which is intrinsically relational and graph-structured.\n The volume of data and the complexity of scientific concepts and relations\nreferred to therein promote the application of advanced knowledge-driven\ntechnologies for managing and interpreting data, with the ultimate aim to\nadvance scientific discovery.\n In this survey and position paper, we discuss recent developments and\nadvances in the use of graph-based technologies in life sciences and set out a\nvision for how these technologies will impact these fields into the future. We\nfocus on three broad topics: the construction and management of Knowledge\nGraphs (KGs), the use of KGs and associated technologies in the discovery of\nnew knowledge, and the use of KGs in artificial intelligence applications to\nsupport explanations (explainable AI). We select a few exemplary use cases for\neach topic, discuss the challenges and open research questions within these\ntopics, and conclude with a perspective and outlook that summarizes the\noverarching challenges and their potential solutions as a guide for future\nresearch.\n","authors":["Jiaoyan Chen","Hang Dong","Janna Hastings","Ernesto Jiménez-Ruiz","Vanessa López","Pierre Monnin","Catia Pesquita","Petr Škoda","Valentina Tamma"],"pdf_url":"https://arxiv.org/pdf/2309.17255v3.pdf","comment":"33 pages, 1 figure, accepted for Transactions on Graph Data and\n Knowledge (TGDK)"},{"id":"http://arxiv.org/abs/2311.15626v1","updated":"2023-11-27T08:45:31Z","published":"2023-11-27T08:45:31Z","title":"The WebCrow French Crossword Solver","summary":" Crossword puzzles are one of the most popular word games, played in different\nlanguages all across the world, where riddle style can vary significantly from\none country to another. 
Automated crossword resolution is challenging, and\ntypical solvers rely on large databases of previously solved crosswords. In\nthis work, we extend WebCrow 2.0, an automatic crossword solver, to French,\nmaking it the first program for crossword solving in the French language. To\ncope with the lack of a large repository of clue-answer crossword data, WebCrow\n2.0 exploits multiple modules, called experts, that retrieve candidate answers\nfrom heterogeneous resources, such as the web, knowledge graphs, and linguistic\nrules. We compared WebCrow's performance against humans in two different\nchallenges. Despite the limited amount of past crosswords, French WebCrow was\ncompetitive, actually outperforming humans in terms of speed and accuracy, thus\nproving its capabilities to generalize to new languages.\n","authors":["Giovanni Angelini","Marco Ernandes","Tommaso laquinta","Caroline Stehlé","Fanny Simões","Kamyar Zeinalipour","Andrea Zugarini","Marco Gori"],"pdf_url":"https://arxiv.org/pdf/2311.15626v1.pdf","comment":"Accepted Paper for EAI Intetain 2023 - 14th EAI International\n Conference on Intelligent Technologies for Interactive Entertainment"},{"id":"http://arxiv.org/abs/2311.15623v1","updated":"2023-11-27T08:38:42Z","published":"2023-11-27T08:38:42Z","title":"Injecting linguistic knowledge into BERT for Dialogue State Tracking","summary":" Dialogue State Tracking (DST) models often employ intricate neural network\narchitectures, necessitating substantial training data, and their inference\nprocesses lack transparency. This paper proposes a method that extracts\nlinguistic knowledge via an unsupervised framework and subsequently utilizes\nthis knowledge to augment BERT's performance and interpretability in DST tasks.\nThe knowledge extraction procedure is computationally economical and does not\nnecessitate annotations or additional training data. The injection of the\nextracted knowledge necessitates the addition of only simple neural modules. We\nemploy the Convex Polytopic Model (CPM) as a feature extraction tool for DST\ntasks and illustrate that the acquired features correlate with the syntactic\nand semantic patterns in the dialogues. This correlation facilitates a\ncomprehensive understanding of the linguistic features influencing the DST\nmodel's decision-making process. We benchmark this framework on various DST\ntasks and observe a notable improvement in accuracy.\n","authors":["Xiaohan Feng","Xixin Wu","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2311.15623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15992v2","updated":"2023-11-27T08:30:00Z","published":"2023-07-29T14:11:15Z","title":"Towards Codable Watermarking for Injecting Multi-bit Information to LLM","summary":" As large language models (LLMs) generate texts with increasing fluency and\nrealism, there is a growing need to identify the source of texts to prevent the\nabuse of LLMs. Text watermarking techniques have proven reliable in\ndistinguishing whether a text is generated by LLMs by injecting hidden patterns\ninto the generated texts. However, we argue that existing watermarking methods\nfor LLMs are encoding-inefficient (only contain one bit of information -\nwhether it is generated from an LLM or not) and cannot flexibly meet the\ndiverse information encoding needs (such as encoding model version, generation\ntime, user id, etc.) in different LLMs application scenarios. 
In this work, we\nconduct the first systematic study on the topic of Codable Text Watermarking\nfor LLMs (CTWL) that allows text watermarks to carry more customizable\ninformation. First of all, we study the taxonomy of LLM watermarking technology\nand give a mathematical formulation for CTWL. Additionally, we provide a\ncomprehensive evaluation system for CTWL: (1) watermarking success rate, (2)\nrobustness against various corruptions, (3) coding rate of payload information,\n(4) encoding and decoding efficiency, (5) impacts on the quality of the\ngenerated text. To meet the requirements of these non-Pareto-improving metrics,\nwe devise a CTWL method named Balance-Marking, based on the motivation of\nensuring that available and unavailable vocabularies for encoding information\nhave approximately equivalent probabilities. Compared to the random vocabulary\npartitioning extended from the existing work, a probability-balanced vocabulary\npartition can significantly improve the quality of the generated text.\nExtensive experimental results have shown that our method outperforms a direct\nbaseline under comprehensive evaluation.\n","authors":["Lean Wang","Wenkai Yang","Deli Chen","Hao Zhou","Yankai Lin","Fandong Meng","Jie Zhou","Xu Sun"],"pdf_url":"https://arxiv.org/pdf/2307.15992v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15614v1","updated":"2023-11-27T08:23:08Z","published":"2023-11-27T08:23:08Z","title":"FreeAL: Towards Human-Free Active Learning in the Era of Large Language\n Models","summary":" Collecting high-quality labeled data for model training is notoriously\ntime-consuming and labor-intensive for various NLP tasks. While copious\nsolutions, such as active learning for small language models (SLMs) and\nprevalent in-context learning in the era of large language models (LLMs), have\nbeen proposed and alleviate the labeling burden to some extent, their\nperformances are still subject to human intervention. It is still underexplored\nhow to reduce the annotation cost in the LLMs era. To bridge this, we\nrevolutionize traditional active learning and propose an innovative\ncollaborative learning framework FreeAL to interactively distill and filter the\ntask-specific knowledge from LLMs. During collaborative training, an LLM serves\nas an active annotator inculcating its coarse-grained knowledge, while a\ndownstream SLM is incurred as a student to filter out high-quality in-context\nsamples to feedback LLM for the subsequent label refinery. Extensive\nexperiments on eight benchmark datasets demonstrate that FreeAL largely\nenhances the zero-shot performances for both SLM and LLM without any human\nsupervision. The code is available at https://github.com/Justherozen/FreeAL .\n","authors":["Ruixuan Xiao","Yiwen Dong","Junbo Zhao","Runze Wu","Minmin Lin","Gang Chen","Haobo Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15614v1.pdf","comment":"Accepted to EMNLP 2023 (Main conference)"},{"id":"http://arxiv.org/abs/2311.15596v1","updated":"2023-11-27T07:44:25Z","published":"2023-11-27T07:44:25Z","title":"Can Vision-Language Models Think from a First-Person Perspective?","summary":" Vision-language models (VLMs) have recently shown promising results in\ntraditional downstream tasks. Evaluation studies have emerged to assess their\nabilities, with the majority focusing on the third-person perspective, and only\na few addressing specific tasks from the first-person perspective. 
However, the\ncapability of VLMs to \"think\" from a first-person perspective, a crucial\nattribute for advancing autonomous agents and robotics, remains largely\nunexplored. To bridge this research gap, we introduce EgoThink, a novel visual\nquestion-answering benchmark that encompasses six core capabilities with twelve\ndetailed dimensions. The benchmark is constructed using selected clips from\negocentric videos, with manually annotated question-answer pairs containing\nfirst-person information. To comprehensively assess VLMs, we evaluate eighteen\npopular VLMs on EgoThink. Moreover, given the open-ended format of the answers,\nwe use GPT-4 as the automatic judge to compute single-answer grading.\nExperimental results indicate that although GPT-4V leads in numerous\ndimensions, all evaluated VLMs still possess considerable potential for\nimprovement in first-person perspective tasks. Meanwhile, enlarging the number\nof trainable parameters has the most significant impact on model performance on\nEgoThink. In conclusion, EgoThink serves as a valuable addition to existing\nevaluation benchmarks for VLMs, providing an indispensable resource for future\nresearch in the realm of embodied artificial intelligence and robotics.\n","authors":["Sijie Cheng","Zhicheng Guo","Jingwen Wu","Kechen Fang","Peng Li","Huaping Liu","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.15596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11509v2","updated":"2023-11-27T06:53:03Z","published":"2023-11-20T03:17:21Z","title":"Token-Level Adversarial Prompt Detection Based on Perplexity Measures\n and Contextual Information","summary":" In recent years, Large Language Models (LLM) have emerged as pivotal tools in\nvarious applications. However, these models are susceptible to adversarial\nprompt attacks, where attackers can carefully curate input strings that lead to\nundesirable outputs. The inherent vulnerability of LLMs stems from their\ninput-output mechanisms, especially when presented with intensely\nout-of-distribution (OOD) inputs. This paper proposes a token-level detection\nmethod to identify adversarial prompts, leveraging the LLM's capability to\npredict the next token's probability. We measure the degree of the model's\nperplexity and incorporate neighboring token information to encourage the\ndetection of contiguous adversarial prompt sequences. As a result, we propose\ntwo methods: one that identifies each token as either being part of an\nadversarial prompt or not, and another that estimates the probability of each\ntoken being part of an adversarial prompt.\n","authors":["Zhengmian Hu","Gang Wu","Saayan Mitra","Ruiyi Zhang","Tong Sun","Heng Huang","Viswanathan Swaminathan"],"pdf_url":"https://arxiv.org/pdf/2311.11509v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01889v4","updated":"2023-11-27T06:38:47Z","published":"2023-10-03T08:44:50Z","title":"Ring Attention with Blockwise Transformers for Near-Infinite Context","summary":" Transformers have emerged as the architecture of choice for many\nstate-of-the-art AI models, showcasing exceptional performance across a wide\nrange of AI applications. However, the memory demands imposed by Transformers\nlimit their ability to handle long sequences, thereby posing challenges in\nutilizing videos, actions, and other long-form sequences and modalities in\ncomplex environments. 
We present a novel approach, Ring Attention with\nBlockwise Transformers (Ring Attention), which leverages blockwise computation\nof self-attention and feedforward to distribute long sequences across multiple\ndevices while fully overlapping the communication of key-value blocks with the\ncomputation of blockwise attention. Our approach enables training and inference\nof sequences that are up to device count times longer than those achievable by\nprior memory-efficient Transformers, without resorting to approximations or\nincurring additional communication and computation overheads. Extensive\nexperiments on language modeling and reinforcement learning tasks demonstrate\nthe effectiveness of our approach in allowing millions of tokens context size\nand improving performance.\n","authors":["Hao Liu","Matei Zaharia","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2310.01889v4.pdf","comment":"Code: https://github.com/lhao499/llm_large_context"},{"id":"http://arxiv.org/abs/2311.15566v1","updated":"2023-11-27T06:31:17Z","published":"2023-11-27T06:31:17Z","title":"SpotServe: Serving Generative Large Language Models on Preemptible\n Instances","summary":" The high computational and memory requirements of generative large language\nmodels (LLMs) make it challenging to serve them cheaply. This paper aims to\nreduce the monetary cost for serving LLMs by leveraging preemptible GPU\ninstances on modern clouds, which offer accesses to spare GPUs at a much\ncheaper price than regular instances but may be preempted by the cloud at any\ntime. Serving LLMs on preemptible instances requires addressing challenges\ninduced by frequent instance preemptions and the necessity of migrating\ninstances to handle these preemptions.\n This paper presents SpotServe, the first distributed LLM serving system on\npreemptible instances. Several key techniques in SpotServe realize fast and\nreliable serving of generative LLMs on cheap preemptible instances. First,\nSpotServe dynamically adapts the LLM parallelization configuration for dynamic\ninstance availability and fluctuating workload, while balancing the trade-off\namong the overall throughput, inference latency and monetary costs. Second, to\nminimize the cost of migrating instances for dynamic reparallelization, the\ntask of migrating instances is formulated as a bipartite graph matching\nproblem, which uses the Kuhn-Munkres algorithm to identify an optimal migration\nplan that minimizes communications. Finally, to take advantage of the grace\nperiod offered by modern clouds, we introduce stateful inference recovery, a\nnew inference mechanism that commits inference progress at a much finer\ngranularity and allows SpotServe to cheaply resume inference upon preemption.\nWe evaluate on real spot instance preemption traces and various popular LLMs\nand show that SpotServe can reduce the P99 tail latency by 2.4 - 9.1x compared\nwith the best existing LLM serving systems. 
We also show that SpotServe can\nleverage the price advantage of preemptive instances, saving 54% monetary cost\ncompared with only using on-demand instances.\n","authors":["Xupeng Miao","Chunan Shi","Jiangfei Duan","Xiaoli Xi","Dahua Lin","Bin Cui","Zhihao Jia"],"pdf_url":"https://arxiv.org/pdf/2311.15566v1.pdf","comment":"ASPLOS 2024"},{"id":"http://arxiv.org/abs/2311.15565v1","updated":"2023-11-27T06:26:53Z","published":"2023-11-27T06:26:53Z","title":"Evaluating the Efficacy of Hybrid Deep Learning Models in Distinguishing\n AI-Generated Text","summary":" My research investigates the use of cutting-edge hybrid deep learning models\nto accurately differentiate between AI-generated text and human writing. I\napplied a robust methodology, utilising a carefully selected dataset comprising\nAI and human texts from various sources, each tagged with instructions.\nAdvanced natural language processing techniques facilitated the analysis of\ntextual features. Combining sophisticated neural networks, the custom model\nenabled it to detect nuanced differences between AI and human content.\n","authors":["Finbarrs Oketunji"],"pdf_url":"https://arxiv.org/pdf/2311.15565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15564v1","updated":"2023-11-27T06:22:57Z","published":"2023-11-27T06:22:57Z","title":"Boot and Switch: Alternating Distillation for Zero-Shot Dense Retrieval","summary":" Neural 'dense' retrieval models are state of the art for many datasets,\nhowever these models often exhibit limited domain transfer ability. Existing\napproaches to adaptation are unwieldy, such as requiring explicit supervision,\ncomplex model architectures, or massive external models. We present\n$\\texttt{ABEL}$, a simple but effective unsupervised method to enhance passage\nretrieval in zero-shot settings. Our technique follows a straightforward loop:\na dense retriever learns from supervision signals provided by a reranker, and\nsubsequently, the reranker is updated based on feedback from the improved\nretriever. By iterating this loop, the two components mutually enhance one\nanother's performance. Experimental results demonstrate that our unsupervised\n$\\texttt{ABEL}$ model outperforms both leading supervised and unsupervised\nretrievers on the BEIR benchmark. Meanwhile, it exhibits strong adaptation\nabilities to tasks and domains that were unseen during training. By either\nfine-tuning $\\texttt{ABEL}$ on labelled data or integrating it with existing\nsupervised dense retrievers, we achieve state-of-the-art\nresults.\\footnote{Source code is available at\n\\url{https://github.com/Fantabulous-J/BootSwitch}.}\n","authors":["Fan Jiang","Qiongkai Xu","Tom Drummond","Trevor Cohn"],"pdf_url":"https://arxiv.org/pdf/2311.15564v1.pdf","comment":"Accepted by EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2311.15563v1","updated":"2023-11-27T06:19:50Z","published":"2023-11-27T06:19:50Z","title":"Noisy Self-Training with Synthetic Queries for Dense Retrieval","summary":" Although existing neural retrieval models reveal promising results when\ntraining data is abundant and the performance keeps improving as training data\nincreases, collecting high-quality annotated data is prohibitively costly. To\nthis end, we introduce a novel noisy self-training framework combined with\nsynthetic queries, showing that neural retrievers can be improved in a\nself-evolution manner with no reliance on any external models. 
Experimental\nresults show that our method improves consistently over existing methods on\nboth general-domain (e.g., MS-MARCO) and out-of-domain (i.e., BEIR) retrieval\nbenchmarks. Extra analysis on low-resource settings reveals that our method is\ndata efficient and outperforms competitive baselines, with as little as 30% of\nlabelled training data. Further extending the framework for reranker training\ndemonstrates that the proposed method is general and yields additional gains on\ntasks of diverse domains.\\footnote{Source code is available at\n\\url{https://github.com/Fantabulous-J/Self-Training-DPR}}\n","authors":["Fan Jiang","Tom Drummond","Trevor Cohn"],"pdf_url":"https://arxiv.org/pdf/2311.15563v1.pdf","comment":"Accepted by EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2311.15548v1","updated":"2023-11-27T05:27:13Z","published":"2023-11-27T05:27:13Z","title":"Deficiency of Large Language Models in Finance: An Empirical Examination\n of Hallucination","summary":" The hallucination issue is recognized as a fundamental deficiency of large\nlanguage models (LLMs), especially when applied to fields such as finance,\neducation, and law. Despite the growing concerns, there has been a lack of\nempirical investigation. In this paper, we provide an empirical examination of\nLLMs' hallucination behaviors in financial tasks. First, we empirically\ninvestigate LLM model's ability of explaining financial concepts and\nterminologies. Second, we assess LLM models' capacity of querying historical\nstock prices. Third, to alleviate the hallucination issue, we evaluate the\nefficacy of four practical methods, including few-shot learning, Decoding by\nContrasting Layers (DoLa), the Retrieval Augmentation Generation (RAG) method\nand the prompt-based tool learning method for a function to generate a query\ncommand. Finally, our major finding is that off-the-shelf LLMs experience\nserious hallucination behaviors in financial tasks. Therefore, there is an\nurgent need to call for research efforts in mitigating LLMs' hallucination.\n","authors":["Haoqiang Kang","Xiao-Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.15548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15544v1","updated":"2023-11-27T05:20:47Z","published":"2023-11-27T05:20:47Z","title":"The effect of source disclosure on evaluation of AI-generated messages:\n A two-part study","summary":" Advancements in artificial intelligence (AI) over the last decade demonstrate\nthat machines can exhibit communicative behavior and influence how humans\nthink, feel, and behave. In fact, the recent development of ChatGPT has shown\nthat large language models (LLMs) can be leveraged to generate high-quality\ncommunication content at scale and across domains, suggesting that they will be\nincreasingly used in practice. However, many questions remain about how knowing\nthe source of the messages influences recipients' evaluation of and preference\nfor AI-generated messages compared to human-generated messages. This paper\ninvestigated this topic in the context of vaping prevention messaging. In Study\n1, which was pre-registered, we examined the influence of source disclosure on\npeople's evaluation of AI-generated health prevention messages compared to\nhuman-generated messages. We found that source disclosure (i.e., labeling the\nsource of a message as AI vs. human) significantly impacted the evaluation of\nthe messages but did not significantly alter message rankings. 
In a follow-up\nstudy (Study 2), we examined how the influence of source disclosure may vary by\nthe participants' negative attitudes towards AI. We found a significant\nmoderating effect of negative attitudes towards AI on message evaluation, but\nnot for message selection. However, for those with moderate levels of negative\nattitudes towards AI, source disclosure decreased the preference for\nAI-generated messages. Overall, the results of this series of studies showed a\nslight bias against AI-generated messages once the source was disclosed, adding\nto the emerging area of study that lies at the intersection of AI and\ncommunication.\n","authors":["Sue Lim","Ralf Schmälzle"],"pdf_url":"https://arxiv.org/pdf/2311.15544v1.pdf","comment":"Manuscript currently under review. Paper presented at 109th Annual\n National Communication Association (NCA) Conference, November 16-19, 2023. 10\n pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.01947v2","updated":"2023-11-27T05:03:31Z","published":"2023-09-05T04:47:55Z","title":"TODM: Train Once Deploy Many Efficient Supernet-Based RNN-T Compression\n For On-device ASR Models","summary":" Automatic Speech Recognition (ASR) models need to be optimized for specific\nhardware before they can be deployed on devices. This can be done by tuning the\nmodel's hyperparameters or exploring variations in its architecture.\nRe-training and re-validating models after making these changes can be a\nresource-intensive task. This paper presents TODM (Train Once Deploy Many), a\nnew approach to efficiently train many sizes of hardware-friendly on-device ASR\nmodels with comparable GPU-hours to that of a single training job. TODM\nleverages insights from prior work on Supernet, where Recurrent Neural Network\nTransducer (RNN-T) models share weights within a Supernet. It reduces layer\nsizes and widths of the Supernet to obtain subnetworks, making them smaller\nmodels suitable for all hardware types. We introduce a novel combination of\nthree techniques to improve the outcomes of the TODM Supernet: adaptive\ndropouts, an in-place Alpha-divergence knowledge distillation, and the use of\nScaledAdam optimizer. We validate our approach by comparing Supernet-trained\nversus individually tuned Multi-Head State Space Model (MH-SSM) RNN-T using\nLibriSpeech. Results demonstrate that our TODM Supernet either matches or\nsurpasses the performance of manually tuned models by up to a relative of 3%\nbetter in word error rate (WER), while efficiently keeping the cost of training\nmany models at a small constant.\n","authors":["Yuan Shangguan","Haichuan Yang","Danni Li","Chunyang Wu","Yassir Fathullah","Dilin Wang","Ayushi Dalmia","Raghuraman Krishnamoorthi","Ozlem Kalinli","Junteng Jia","Jay Mahadeokar","Xin Lei","Mike Seltzer","Vikas Chandra"],"pdf_url":"https://arxiv.org/pdf/2309.01947v2.pdf","comment":"Meta AI; Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2310.18332v2","updated":"2023-11-27T04:22:54Z","published":"2023-10-20T12:44:44Z","title":"WordArt Designer: User-Driven Artistic Typography Synthesis using Large\n Language Models","summary":" This paper introduces WordArt Designer, a user-driven framework for artistic\ntypography synthesis, relying on the Large Language Model (LLM). The system\nincorporates four key modules: the LLM Engine, SemTypo, StyTypo, and TexTypo\nmodules. 
1) The LLM Engine, empowered by the LLM (e.g., GPT-3.5), interprets\nuser inputs and generates actionable prompts for the other modules, thereby\ntransforming abstract concepts into tangible designs. 2) The SemTypo module\noptimizes font designs using semantic concepts, striking a balance between\nartistic transformation and readability. 3) Building on the semantic layout\nprovided by the SemTypo module, the StyTypo module creates smooth, refined\nimages. 4) The TexTypo module further enhances the design's aesthetics through\ntexture rendering, enabling the generation of inventive textured fonts.\nNotably, WordArt Designer highlights the fusion of generative AI with artistic\ntypography. Experience its capabilities on ModelScope:\nhttps://www.modelscope.cn/studios/WordArt/WordArt.\n","authors":["Jun-Yan He","Zhi-Qi Cheng","Chenyang Li","Jingdong Sun","Wangmeng Xiang","Xianhui Lin","Xiaoyang Kang","Zengke Jin","Yusen Hu","Bin Luo","Yifeng Geng","Xuansong Xie","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.18332v2.pdf","comment":"Accepted by EMNLP 2023, 10 pages, 11 figures, 1 table, the system is\n at https://www.modelscope.cn/studios/WordArt/WordArt"},{"id":"http://arxiv.org/abs/2311.15525v1","updated":"2023-11-27T04:01:13Z","published":"2023-11-27T04:01:13Z","title":"Overview of the VLSP 2022 -- Abmusu Shared Task: A Data Challenge for\n Vietnamese Abstractive Multi-document Summarization","summary":" This paper reports the overview of the VLSP 2022 - Vietnamese abstractive\nmulti-document summarization (Abmusu) shared task for Vietnamese News. This\ntask is hosted at the 9$^{th}$ annual workshop on Vietnamese Language and\nSpeech Processing (VLSP 2022). The goal of Abmusu shared task is to develop\nsummarization systems that could create abstractive summaries automatically for\na set of documents on a topic. The model input is multiple news documents on\nthe same topic, and the corresponding output is a related abstractive summary.\nIn the scope of Abmusu shared task, we only focus on Vietnamese news\nsummarization and build a human-annotated dataset of 1,839 documents in 600\nclusters, collected from Vietnamese news in 8 categories. Participated models\nare evaluated and ranked in terms of \\texttt{ROUGE2-F1} score, the most typical\nevaluation metric for document summarization problem.\n","authors":["Mai-Vu Tran","Hoang-Quynh Le","Duy-Cat Can","Quoc-An Nguyen"],"pdf_url":"https://arxiv.org/pdf/2311.15525v1.pdf","comment":"VLSP 2022"},{"id":"http://arxiv.org/abs/2311.15513v1","updated":"2023-11-27T03:17:09Z","published":"2023-11-27T03:17:09Z","title":"A Comparative and Experimental Study on Automatic Question Answering\n Systems and its Robustness against Word Jumbling","summary":" Question answer generation using Natural Language Processing models is\nubiquitous in the world around us. It is used in many use cases such as the\nbuilding of chat bots, suggestive prompts in google search and also as a way of\nnavigating information in banking mobile applications etc. It is highly\nrelevant because a frequently asked questions (FAQ) list can only have a finite\namount of questions but a model which can perform question answer generation\ncould be able to answer completely new questions that are within the scope of\nthe data. This helps us to be able to answer new questions accurately as long\nas it is a relevant question. In commercial applications, it can be used to\nincrease customer satisfaction and ease of usage. 
However a lot of data is\ngenerated by humans so it is susceptible to human error and this can adversely\naffect the model's performance and we are investigating this through our work\n","authors":["Shashidhar Reddy Javaji","Haoran Hu","Sai Sameer Vennam","Vijaya Gajanan Buddhavarapu"],"pdf_url":"https://arxiv.org/pdf/2311.15513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15509v1","updated":"2023-11-27T03:08:41Z","published":"2023-11-27T03:08:41Z","title":"A Corpus for Named Entity Recognition in Chinese Novels with\n Multi-genres","summary":" Entities like person, location, organization are important for literary text\nanalysis. The lack of annotated data hinders the progress of named entity\nrecognition (NER) in literary domain. To promote the research of literary NER,\nwe build the largest multi-genre literary NER corpus containing 263,135\nentities in 105,851 sentences from 260 online Chinese novels spanning 13\ndifferent genres. Based on the corpus, we investigate characteristics of\nentities from different genres. We propose several baseline NER models and\nconduct cross-genre and cross-domain experiments. Experimental results show\nthat genre difference significantly impact NER performance though not as much\nas domain difference like literary domain and news domain. Compared with NER in\nnews domain, literary NER still needs much improvement and the\nOut-of-Vocabulary (OOV) problem is more challenging due to the high variety of\nentities in literary works.\n","authors":["Hanjie Zhao","Jinge Xie","Yuchen Yan","Yuxiang Jia","Yawen Ye","Hongying Zan"],"pdf_url":"https://arxiv.org/pdf/2311.15509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15507v1","updated":"2023-11-27T03:05:48Z","published":"2023-11-27T03:05:48Z","title":"Improving Word Sense Disambiguation in Neural Machine Translation with\n Salient Document Context","summary":" Lexical ambiguity is a challenging and pervasive problem in machine\ntranslation (\\mt). We introduce a simple and scalable approach to resolve\ntranslation ambiguity by incorporating a small amount of extra-sentential\ncontext in neural \\mt. Our approach requires no sense annotation and no change\nto standard model architectures. Since actual document context is not available\nfor the vast majority of \\mt training data, we collect related sentences for\neach input to construct pseudo-documents. Salient words from pseudo-documents\nare then encoded as a prefix to each source sentence to condition the\ngeneration of the translation. To evaluate, we release \\docmucow, a challenge\nset for translation disambiguation based on the English-German \\mucow\n\\cite{raganato-etal-2020-evaluation} augmented with document IDs. 
Extensive\nexperiments show that our method translates ambiguous source words better than\nstrong sentence-level baselines and comparable document-level baselines while\nreducing training costs.\n","authors":["Elijah Rippeth","Marine Carpuat","Kevin Duh","Matt Post"],"pdf_url":"https://arxiv.org/pdf/2311.15507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15500v1","updated":"2023-11-27T02:55:34Z","published":"2023-11-27T02:55:34Z","title":"Function-constrained Program Synthesis","summary":" This work introduces (1) a technique that allows large language models (LLMs)\nto leverage user-provided code when solving programming tasks and (2) a method\nto iteratively generate modular sub-functions that can aid future code\ngeneration attempts when the initial code generated by the LLM is inadequate.\nGenerating computer programs in general-purpose programming languages like\nPython poses a challenge for LLMs when instructed to use code provided in the\nprompt. Code-specific LLMs (e.g., GitHub Copilot, CodeLlama2) can generate code\ncompletions in real-time by drawing on all code available in a development\nenvironment. However, restricting code-specific LLMs to use only in-context\ncode is not straightforward, as the model is not explicitly instructed to use\nthe user-provided code and users cannot highlight precisely which snippets of\ncode the model should incorporate into its context. Moreover, current systems\nlack effective recovery methods, forcing users to iteratively re-prompt the\nmodel with modified prompts until a sufficient solution is reached. Our method\ndiffers from traditional LLM-powered code-generation by constraining\ncode-generation to an explicit function set and enabling recovery from failed\nattempts through automatically generated sub-functions. When the LLM cannot\nproduce working code, we generate modular sub-functions to aid subsequent\nattempts at generating functional code. A by-product of our method is a library\nof reusable sub-functions that can solve related tasks, imitating a software\nteam where efficiency scales with experience. We also introduce a new\n\"half-shot\" evaluation paradigm that provides tighter estimates of LLMs' coding\nabilities compared to traditional zero-shot evaluation. Our proposed evaluation\nmethod encourages models to output solutions in a structured format, decreasing\nsyntax errors that can be mistaken for poor coding ability.\n","authors":["Patrick Hajali","Ignas Budvytis"],"pdf_url":"https://arxiv.org/pdf/2311.15500v1.pdf","comment":"17 pages, 6 figures, 2023 NeurIPS R0-Fomo Workshop"},{"id":"http://arxiv.org/abs/2311.13534v2","updated":"2023-11-27T02:52:46Z","published":"2023-11-22T17:14:54Z","title":"LM-Cocktail: Resilient Tuning of Language Models via Model Merging","summary":" The pre-trained language models are continually fine-tuned to better support\ndownstream applications. However, this operation may result in significant\nperformance degeneration on general tasks beyond the targeted domain. To\novercome this problem, we propose a novel method which enables the fine-tuned\nmodel to stay resilient in general perspectives. Our method is conducted in the\nform of model merging (namely LM-Cocktail), where the fine-tuned language model\nis merged with the pre-trained base model or the peer models from other domains\nthrough weighted average. 
Despite simplicity, LM-Cocktail is surprisingly\neffective: the resulted model is able to achieve a strong empirical performance\nin the whole scope of general tasks while preserving a superior capacity in its\ntargeted domain. We conduct comprehensive experiments with LLama and BGE model\non popular benchmarks, including FLAN, MMLU, MTEB, whose results validate the\nefficacy of our proposed method. The code and checkpoints are available at\nhttps://github.com/FlagOpen/FlagEmbedding/tree/master/LM_Cocktail.\n","authors":["Shitao Xiao","Zheng Liu","Peitian Zhang","Xingrun Xing"],"pdf_url":"https://arxiv.org/pdf/2311.13534v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15490v1","updated":"2023-11-27T02:17:11Z","published":"2023-11-27T02:17:11Z","title":"Optimizing and Fine-tuning Large Language Model for Urban Renewal","summary":" This study aims to innovatively explore adaptive applications of large\nlanguage models (LLM) in urban renewal. It also aims to improve its performance\nand text generation quality for knowledge question-answering (QA) tasks. Based\non the ChatGLM, we automatically generate QA datasets using urban renewal\nscientific literature corpora in a self-instruct manner and then conduct joint\nfine-tuning training on the model using the Prefix and LoRA fine-tuning methods\nto create an LLM for urban renewal. By guiding the LLM to automatically\ngenerate QA data based on prompt words and given text, it is possible to\nquickly obtain datasets in the urban renewal field and provide data support for\nthe fine-tuning training of LLMs. The experimental results show that the joint\nfine-tuning training method proposed in this study can significantly improve\nthe performance of LLM on the QA tasks. Compared with LoRA fine-tuning, the\nmethod improves the Bleu and Rouge metrics on the test by about 5%; compared\nwith the model before fine-tuning, the method improves the Bleu and Rouge\nmetrics by about 15%-20%. This study demonstrates the effectiveness and\nsuperiority of the joint fine-tuning method using Prefix and LoRA for ChatGLM\nin the urban renewal knowledge QA tasks. It provides a new approach for\nfine-tuning LLMs on urban renewal-related tasks.\n","authors":["Xi Wang","Xianyao Ling","Tom Zhang","Xuecao Li","Shaolan Wang","Zhixing Li","Liang Zhang","Peng Gong"],"pdf_url":"https://arxiv.org/pdf/2311.15490v1.pdf","comment":"11 pages, 2 figures, 2 tables, 41 references"},{"id":"http://arxiv.org/abs/2311.15480v1","updated":"2023-11-27T01:44:02Z","published":"2023-11-27T01:44:02Z","title":"Automatic Time Signature Determination for New Scores Using Lyrics for\n Latent Rhythmic Structure","summary":" There has recently been a sharp increase in interest in Artificial\nIntelligence-Generated Content (AIGC). Despite this, musical components such as\ntime signatures have not been studied sufficiently to form an algorithmic\ndetermination approach for new compositions, especially lyrical songs. This is\nlikely because of the neglect of musical details, which is critical for\nconstructing a robust framework. Specifically, time signatures establish the\nfundamental rhythmic structure for almost all aspects of a song, including the\nphrases and notes. In this paper, we propose a novel approach that only uses\nlyrics as input to automatically generate a fitting time signature for lyrical\nsongs and uncover the latent rhythmic structure utilizing explainable machine\nlearning models. 
In particular, we devise multiple methods that are associated\nwith discovering lyrical patterns and creating new features that simultaneously\ncontain lyrical, rhythmic, and statistical information. In this approach, the\nbest of our experimental results reveal a 97.6% F1 score and a 0.996 Area Under\nthe Curve (AUC) of the Receiver Operating Characteristic (ROC) score. In\nconclusion, our research directly generates time signatures from lyrics\nautomatically for new scores utilizing machine learning, which is an innovative\nidea that approaches an understudied component of musicology and therefore\ncontributes significantly to the future of Artificial Intelligence (AI) music\ngeneration.\n","authors":["Callie C. Liao","Duoduo Liao","Jesse Guessford"],"pdf_url":"https://arxiv.org/pdf/2311.15480v1.pdf","comment":"Submitted to IEEE Big Data 2023 Conference"},{"id":"http://arxiv.org/abs/2305.11853v3","updated":"2023-11-27T00:42:07Z","published":"2023-05-19T17:43:58Z","title":"How to Prompt LLMs for Text-to-SQL: A Study in Zero-shot, Single-domain,\n and Cross-domain Settings","summary":" Large language models (LLMs) with in-context learning have demonstrated\nremarkable capability in the text-to-SQL task. Previous research has prompted\nLLMs with various demonstration-retrieval strategies and intermediate reasoning\nsteps to enhance the performance of LLMs. However, those works often employ\nvaried strategies when constructing the prompt text for text-to-SQL inputs,\nsuch as databases and demonstration examples. This leads to a lack of\ncomparability in both the prompt constructions and their primary contributions.\nFurthermore, selecting an effective prompt construction has emerged as a\npersistent problem for future research. To address this limitation, we\ncomprehensively investigate the impact of prompt constructions across various\nsettings and provide insights into prompt constructions for future text-to-SQL\nstudies.\n","authors":["Shuaichen Chang","Eric Fosler-Lussier"],"pdf_url":"https://arxiv.org/pdf/2305.11853v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16362v1","updated":"2023-11-27T23:03:01Z","published":"2023-11-27T23:03:01Z","title":"Reducing Gender Bias in Machine Translation through Counterfactual Data\n Generation","summary":" Recent advances in neural methods have led to substantial improvement in the\nquality of Neural Machine Translation (NMT) systems. However, these systems\nfrequently produce translations with inaccurate gender (Stanovsky et al.,\n2019), which can be traced to bias in training data. Saunders and Byrne (2020)\ntackle this problem with a handcrafted dataset containing balanced gendered\nprofession words. By using this data to fine-tune an existing NMT model, they\nshow that gender bias can be significantly mitigated, albeit at the expense of\ntranslation quality due to catastrophic forgetting. They recover some of the\nlost quality with modified training objectives or additional models at\ninference. We find, however, that simply supplementing the handcrafted dataset\nwith a random sample from the base model training corpus is enough to\nsignificantly reduce the catastrophic forgetting. We also propose a novel\ndomain-adaptation technique that leverages in-domain data created with the\ncounterfactual data generation techniques proposed by Zmigrod et al. (2019) to\nfurther improve accuracy on the WinoMT challenge test set without significant\nloss in translation quality. 
We show its effectiveness in NMT systems from\nEnglish into three morphologically rich languages French, Spanish, and Italian.\nThe relevant dataset and code will be available at Github.\n","authors":["Ranjita Naik","Spencer Rarrick","Vishal Chowdhary"],"pdf_url":"https://arxiv.org/pdf/2311.16362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16338v1","updated":"2023-11-27T21:54:50Z","published":"2023-11-27T21:54:50Z","title":"Releasing the CRaQAn (Coreference Resolution in Question-Answering): An\n open-source dataset and dataset creation methodology using\n instruction-following models","summary":" Instruction-following language models demand robust methodologies for\ninformation retrieval to augment instructions for question-answering\napplications. A primary challenge is the resolution of coreferences in the\ncontext of chunking strategies for long documents. The critical barrier to\nexperimentation of handling coreferences is a lack of open source datasets,\nspecifically in question-answering tasks that require coreference resolution.\nIn this work we present our Coreference Resolution in Question-Answering\n(CRaQAn) dataset, an open-source dataset that caters to the nuanced information\nretrieval requirements of coreference resolution in question-answering tasks by\nproviding over 250 question-answer pairs containing coreferences. To develop\nthis dataset, we developed a novel approach for creating high-quality datasets\nusing an instruction-following model (GPT-4) and a Recursive Criticism and\nImprovement Loop.\n","authors":["Rob Grzywinski","Joshua D'Arcy","Rob Naidoff","Ashish Shukla","Alex Browne","Ren Gibbons","Brinnae Bent"],"pdf_url":"https://arxiv.org/pdf/2311.16338v1.pdf","comment":"NeurIPS 2023 Workshop on Instruction Tuning and Instruction Following"},{"id":"http://arxiv.org/abs/2310.17639v2","updated":"2023-11-27T21:05:54Z","published":"2023-10-26T17:54:52Z","title":"In-Context Learning Dynamics with Random Binary Sequences","summary":" Large language models (LLMs) trained on huge corpora of text datasets\ndemonstrate intriguing capabilities, achieving state-of-the-art performance on\ntasks they were not explicitly trained for. The precise nature of LLM\ncapabilities is often mysterious, and different prompts can elicit different\ncapabilities through in-context learning. We propose a framework that enables\nus to analyze in-context learning dynamics to understand latent concepts\nunderlying LLMs' behavioral patterns. This provides a more nuanced\nunderstanding than success-or-failure evaluation benchmarks, but does not\nrequire observing internal activations as a mechanistic interpretation of\ncircuits would. Inspired by the cognitive science of human randomness\nperception, we use random binary sequences as context and study dynamics of\nin-context learning by manipulating properties of context data, such as\nsequence length. In the latest GPT-3.5+ models, we find emergent abilities to\ngenerate seemingly random numbers and learn basic formal languages, with\nstriking in-context learning dynamics where model outputs transition sharply\nfrom seemingly random behaviors to deterministic repetition.\n","authors":["Eric J. Bigelow","Ekdeep Singh Lubana","Robert P. Dick","Hidenori Tanaka","Tomer D. 
Ullman"],"pdf_url":"https://arxiv.org/pdf/2310.17639v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16302v1","updated":"2023-11-27T20:33:54Z","published":"2023-11-27T20:33:54Z","title":"Comprehensive Benchmarking of Entropy and Margin Based Scoring Metrics\n for Data Selection","summary":" While data selection methods have been studied extensively in active\nlearning, data pruning, and data augmentation settings, there is little\nevidence for the efficacy of these methods in industry scale settings,\nparticularly in low-resource languages. Our work presents ways of assessing\nprospective training examples in those settings for their \"usefulness\" or\n\"difficulty\". We also demonstrate how these measures can be used in selecting\nimportant examples for training supervised machine learning models. We\nprimarily experiment with entropy and Error L2-Norm (EL2N) scores. We use these\nmetrics to curate high quality datasets from a large pool of \\textit{Weak\nSignal Labeled} data, which assigns no-defect high confidence hypotheses during\ninference as ground truth labels. We then conduct training data augmentation\nexperiments using these de-identified datasets and demonstrate that score-based\nselection can result in a 2% decrease in semantic error rate and 4%-7% decrease\nin domain classification error rate when compared to the baseline technique of\nrandom selection.\n","authors":["Anusha Sabbineni","Nikhil Anand","Maria Minakova"],"pdf_url":"https://arxiv.org/pdf/2311.16302v1.pdf","comment":"Accepted to Efficient Natural Language and Speech Processing\n (ENLSP-III) workshop at NeurIPS '23"},{"id":"http://arxiv.org/abs/2311.16298v1","updated":"2023-11-27T20:19:22Z","published":"2023-11-27T20:19:22Z","title":"Influence Scores at Scale for Efficient Language Data Sampling","summary":" Modern ML systems ingest data aggregated from diverse sources, such as\nsynthetic, human-annotated, and live customer traffic. Understanding\n\\textit{which} examples are important to the performance of a learning\nalgorithm is crucial for efficient model training. Recently, a growing body of\nliterature has given rise to various \"influence scores,\" which use training\nartifacts such as model confidence or checkpointed gradients to identify\nimportant subsets of data. However, these methods have primarily been developed\nin computer vision settings, and it remains unclear how well they generalize to\nlanguage-based tasks using pretrained models.\n In this paper, we explore the applicability of influence scores in language\nclassification tasks. We evaluate a diverse subset of these scores on the SNLI\ndataset by quantifying accuracy changes in response to pruning training data\nthrough random and influence-score-based sampling. We then stress-test one of\nthe scores -- \"variance of gradients\" (VoG) from Agarwal et al. (2022) -- in an\nNLU model stack that was exposed to dynamic user speech patterns in a voice\nassistant type of setting. Our experiments demonstrate that in many cases,\nencoder-based language models can be finetuned on roughly 50% of the original\ndata without degradation in performance metrics. 
Along the way, we summarize\nlessons learned from applying out-of-the-box implementations of influence\nscores, quantify the effects of noisy and class-imbalanced data, and offer\nrecommendations on score-based sampling for better accuracy and training\nefficiency.\n","authors":["Nikhil Anand","Joshua Tan","Maria Minakova"],"pdf_url":"https://arxiv.org/pdf/2311.16298v1.pdf","comment":"Accepted at EMNLP '23"},{"id":"http://arxiv.org/abs/2311.16292v1","updated":"2023-11-27T20:10:13Z","published":"2023-11-27T20:10:13Z","title":"Student Mastery or AI Deception? Analyzing ChatGPT's Assessment\n Proficiency and Evaluating Detection Strategies","summary":" Generative AI systems such as ChatGPT have a disruptive effect on learning\nand assessment. Computer science requires practice to develop skills in problem\nsolving and programming that are traditionally developed using assignments.\nGenerative AI has the capability of completing these assignments for students\nwith high accuracy, which dramatically increases the potential for academic\nintegrity issues and students not achieving desired learning outcomes. This\nwork investigates the performance of ChatGPT by evaluating it across three\ncourses (CS1,CS2,databases). ChatGPT completes almost all introductory\nassessments perfectly. Existing detection methods, such as MOSS and JPlag\n(based on similarity metrics) and GPTzero (AI detection), have mixed success in\nidentifying AI solutions. Evaluating instructors and teaching assistants using\nheuristics to distinguish between student and AI code shows that their\ndetection is not sufficiently accurate. These observations emphasize the need\nfor adapting assessments and improved detection methods.\n","authors":["Kevin Wang","Seth Akins","Abdallah Mohammed","Ramon Lawrence"],"pdf_url":"https://arxiv.org/pdf/2311.16292v1.pdf","comment":"7 pages, Published in 2023 International Conference on Computational\n Science and Computational Intelligence Research Track on Education, IEEE CPS"},{"id":"http://arxiv.org/abs/2212.09744v2","updated":"2023-11-27T19:57:09Z","published":"2022-12-19T18:59:34Z","title":"DSI++: Updating Transformer Memory with New Documents","summary":" Differentiable Search Indices (DSIs) encode a corpus of documents in model\nparameters and use the same model to answer user queries directly. Despite the\nstrong performance of DSI models, deploying them in situations where the corpus\nchanges over time is computationally expensive because reindexing the corpus\nrequires re-training the model. In this work, we introduce DSI++, a continual\nlearning challenge for DSI to incrementally index new documents while being\nable to answer queries related to both previously and newly indexed documents.\nAcross different model scales and document identifier representations, we show\nthat continual indexing of new documents leads to considerable forgetting of\npreviously indexed documents. We also hypothesize and verify that the model\nexperiences forgetting events during training, leading to unstable learning. To\nmitigate these issues, we investigate two approaches. The first focuses on\nmodifying the training dynamics. Flatter minima implicitly alleviate\nforgetting, so we optimize for flatter loss basins and show that the model\nstably memorizes more documents ($+12\\%$). Next, we introduce a generative\nmemory to sample pseudo-queries for documents and supplement them during\ncontinual indexing to prevent forgetting for the retrieval task. 
Extensive\nexperiments on novel continual indexing benchmarks based on Natural Questions\n(NQ) and MS MARCO demonstrate that our proposed solution mitigates forgetting\nsignificantly. Concretely, it improves the average Hits@10 by $+21.1\\%$ over\ncompetitive baselines for NQ and requires $6$ times fewer model updates\ncompared to re-training the DSI model for incrementally indexing five corpora\nin a sequence.\n","authors":["Sanket Vaibhav Mehta","Jai Gupta","Yi Tay","Mostafa Dehghani","Vinh Q. Tran","Jinfeng Rao","Marc Najork","Emma Strubell","Donald Metzler"],"pdf_url":"https://arxiv.org/pdf/2212.09744v2.pdf","comment":"Accepted at EMNLP 2023 main conference"},{"id":"http://arxiv.org/abs/2305.14726v2","updated":"2023-11-27T19:53:27Z","published":"2023-05-24T05:04:00Z","title":"In-Context Demonstration Selection with Cross Entropy Difference","summary":" Large language models (LLMs) can use in-context demonstrations to improve\nperformance on zero-shot tasks. However, selecting the best in-context examples\nis challenging because model performance can vary widely depending on the\nselected examples. We present a cross-entropy difference (CED) method for\nselecting in-context demonstrations. Our method is based on the observation\nthat the effectiveness of in-context demonstrations negatively correlates with\nthe perplexity of the test example by a language model that was finetuned on\nthat demonstration. We utilize parameter efficient finetuning to train small\nmodels on training data that are used for computing the cross-entropy\ndifference between a test example and every candidate in-context demonstration.\nThis metric is used to rank and select in-context demonstrations independently\nfor each test input. We evaluate our method on a mix-domain dataset that\ncombines 8 benchmarks, representing 4 text generation tasks, showing that CED\nfor in-context demonstration selection can improve performance for a variety of\nLLMs.\n","authors":["Dan Iter","Reid Pryzant","Ruochen Xu","Shuohang Wang","Yang Liu","Yichong Xu","Chenguang Zhu"],"pdf_url":"https://arxiv.org/pdf/2305.14726v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16267v1","updated":"2023-11-27T19:17:39Z","published":"2023-11-27T19:17:39Z","title":"Applications of Large Language Models in Data Processing: Innovative\n Approaches to Segmenting and Renewing Information","summary":" Our paper investigates effective methods for code generation in\n\"specific-domain\" applications, including the use of Large Language Models\n(LLMs) for data segmentation and renewal, as well as stimulating deeper\nthinking in LLMs through prompt adjustments. Using a real company product as an\nexample, we provide user manuals, API documentation, and other data. The ideas\ndiscussed in this paper help segment and then convert this data into semantic\nvectors to better reflect their true positioning. Subsequently, user\nrequirements are transformed into vectors to retrieve the most relevant\ncontent, achieving about 70% accuracy in simple to medium-complexity tasks\nthrough various prompt techniques. This paper is the first to enhance\nspecific-domain code generation effectiveness from this perspective.\nAdditionally, we experiment with generating more scripts from a limited number\nusing llama2-based fine-tuning to test its effectiveness in professional domain\ncode generation. 
This is a challenging and promising field, and once achieved,\nit will not only lead to breakthroughs in LLM development across multiple\nindustries but also enable LLMs to understand and learn any new knowledge\neffectively.\n","authors":["Yu-Chen Lin","Akhilesh Kumar","Wen-Liang Zhang","Norman Chang","Muhammad Zakir","Rucha Apte","Chao Wang","Jyh-Shing Roger Jang"],"pdf_url":"https://arxiv.org/pdf/2311.16267v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16258v1","updated":"2023-11-27T19:04:37Z","published":"2023-11-27T19:04:37Z","title":"An Exploration of Left-Corner Transformations","summary":" The left-corner transformation (Rosenkrantz and Lewis, 1970) is used to\nremove left recursion from context-free grammars, which is an important step\ntowards making the grammar parsable top-down with simple techniques. This paper\ngeneralizes prior left-corner transformations to support semiring-weighted\nproduction rules and to provide finer-grained control over which left corners\nmay be moved. Our generalized left-corner transformation (GLCT) arose from\nunifying the left-corner transformation and speculation transformation (Eisner\nand Blatz, 2007), originally for logic programming. Our new transformation and\nspeculation define equivalent weighted languages. Yet, their derivation trees\nare structurally different in an important way: GLCT replaces left recursion\nwith right recursion, and speculation does not. We also provide several\ntechnical results regarding the formal relationships between the outputs of\nGLCT, speculation, and the original grammar. Lastly, we empirically investigate\nthe efficiency of GLCT for left-recursion elimination from grammars of nine\nlanguages.\n","authors":["Andreas Opedal","Eleftheria Tsipidi","Tiago Pimentel","Ryan Cotterell","Tim Vieira"],"pdf_url":"https://arxiv.org/pdf/2311.16258v1.pdf","comment":"Main conference long paper at EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.16254v1","updated":"2023-11-27T19:02:17Z","published":"2023-11-27T19:02:17Z","title":"Removing NSFW Concepts from Vision-and-Language Models for Text-to-Image\n Retrieval and Generation","summary":" Vision-and-Language models such as CLIP have demonstrated remarkable\neffectiveness across a wide range of tasks. However, these models are typically\ntrained on web-scale data, which can introduce inappropriate content and lead\nto the development of unsafe and biased behavior. This, in turn, hampers their\napplicability in sensitive and trustworthy contexts and could raise significant\nconcern in their adoption. To overcome these limitations, we introduce a\nmethodology to make Vision-and-Language models safer by removing their\nsensitivity to not-safe-for-work concepts. We show how this can be done by\ndistilling from a large language model which converts between safe and unsafe\nsentences and which is fine-tuned starting from just 100 manually-curated\npairs. We conduct extensive experiments on the resulting embedding space for\nboth retrieval and text-to-image generation, where we show that our model can\nalso be properly employed with pre-trained image generators. 
Our source code\nand trained models are available at: https://github.com/aimagelab/safe-clip.\n","authors":["Samuele Poppi","Tobia Poppi","Federico Cocchi","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2311.16254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16502v1","updated":"2023-11-27T17:33:21Z","published":"2023-11-27T17:33:21Z","title":"MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning\n Benchmark for Expert AGI","summary":" We introduce MMMU: a new benchmark designed to evaluate multimodal models on\nmassive multi-discipline tasks demanding college-level subject knowledge and\ndeliberate reasoning. MMMU includes 11.5K meticulously collected multimodal\nquestions from college exams, quizzes, and textbooks, covering six core\ndisciplines: Art & Design, Business, Science, Health & Medicine, Humanities &\nSocial Science, and Tech & Engineering. These questions span 30 subjects and\n183 subfields, comprising 30 highly heterogeneous image types, such as charts,\ndiagrams, maps, tables, music sheets, and chemical structures. Unlike existing\nbenchmarks, MMMU focuses on advanced perception and reasoning with\ndomain-specific knowledge, challenging models to perform tasks akin to those\nfaced by experts. Our evaluation of 14 open-source LMMs and the proprietary\nGPT-4V(ision) highlights the substantial challenges posed by MMMU. Even the\nadvanced GPT-4V only achieves a 56% accuracy, indicating significant room for\nimprovement. We believe MMMU will stimulate the community to build\nnext-generation multimodal foundation models towards expert artificial general\nintelligence.\n","authors":["Xiang Yue","Yuansheng Ni","Kai Zhang","Tianyu Zheng","Ruoqi Liu","Ge Zhang","Samuel Stevens","Dongfu Jiang","Weiming Ren","Yuxuan Sun","Cong Wei","Botao Yu","Ruibin Yuan","Renliang Sun","Ming Yin","Boyuan Zheng","Zhenzhu Yang","Yibo Liu","Wenhao Huang","Huan Sun","Yu Su","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2311.16502v1.pdf","comment":"115 pages, 99 figures"},{"id":"http://arxiv.org/abs/2311.16483v1","updated":"2023-11-27T15:20:23Z","published":"2023-11-27T15:20:23Z","title":"ChartLlama: A Multimodal LLM for Chart Understanding and Generation","summary":" Multi-modal large language models have demonstrated impressive performances\non most vision-language tasks. However, the model generally lacks the\nunderstanding capabilities for specific domain data, particularly when it comes\nto interpreting chart figures. This is mainly due to the lack of relevant\nmulti-modal instruction tuning datasets. In this article, we create a\nhigh-quality instruction-tuning dataset leveraging GPT-4. We develop a\nmulti-step data generation process in which different steps are responsible for\ngenerating tabular data, creating chart figures, and designing instruction\ntuning data separately. Our method's flexibility enables us to generate\ndiverse, high-quality instruction-tuning data consistently and efficiently\nwhile maintaining a low resource expenditure. Additionally, it allows us to\nincorporate a wider variety of chart and task types not yet featured in\nexisting datasets. Next, we introduce ChartLlama, a multi-modal large language\nmodel that we've trained using our created dataset. ChartLlama outperforms all\nprior methods in ChartQA, Chart-to-text, and Chart-extraction evaluation\nbenchmarks. 
Additionally, ChartLlama significantly improves upon the baseline\nin our specially compiled chart dataset, which includes new chart and task\ntypes. The results of ChartLlama confirm the value and huge potential of our\nproposed data generation method in enhancing chart comprehension.\n","authors":["Yucheng Han","Chi Zhang","Xin Chen","Xu Yang","Zhibin Wang","Gang Yu","Bin Fu","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.16483v1.pdf","comment":"Code and model on https://tingxueronghua.github.io/ChartLlama/"},{"id":"http://arxiv.org/abs/2311.16203v1","updated":"2023-11-27T08:52:10Z","published":"2023-11-27T08:52:10Z","title":"ChatTraffic: Text-to-Traffic Generation via Diffusion Model","summary":" Traffic prediction is one of the most significant foundations in Intelligent\nTransportation Systems (ITS). Traditional traffic prediction methods rely only\non historical traffic data to predict traffic trends and face two main\nchallenges: 1) insensitivity to unusual events, and 2) poor performance in\nlong-term prediction. In this work, we explore how generative models combined\nwith text describing the traffic system can be applied for traffic generation\nand name the task Text-to-Traffic Generation (TTG). The key challenge of the\nTTG task is how to associate text with the spatial structure of the road\nnetwork and traffic data for generating traffic situations. To this end, we\npropose ChatTraffic, the first diffusion model for text-to-traffic generation.\nTo guarantee the consistency between synthetic and real data, we augment a\ndiffusion model with the Graph Convolutional Network (GCN) to extract spatial\ncorrelations of traffic data. In addition, we construct a large dataset\ncontaining text-traffic pairs for the TTG task. We benchmarked our model\nqualitatively and quantitatively on the released dataset. The experimental\nresults indicate that ChatTraffic can generate realistic traffic situations\nfrom the text. Our code and dataset are available at\nhttps://github.com/ChyaZhang/ChatTraffic.\n","authors":["Chengyang Zhang","Yong Zhang","Qitan Shao","Bo Li","Yisheng Lv","Xinglin Piao","Baocai Yin"],"pdf_url":"https://arxiv.org/pdf/2311.16203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16201v1","updated":"2023-11-27T07:19:26Z","published":"2023-11-27T07:19:26Z","title":"Pre-trained Language Models Do Not Help Auto-regressive Text-to-Image\n Generation","summary":" Recent advances in image tokenizers, such as VQ-VAE, have enabled\ntext-to-image generation using auto-regressive methods, similar to language\nmodeling. However, these methods have yet to leverage pre-trained language\nmodels, despite their adaptability to various downstream tasks. In this work,\nwe explore this gap by adapting a pre-trained language model for\nauto-regressive text-to-image generation, and find that pre-trained language\nmodels offer limited help. We provide a two-fold explanation by analyzing\ntokens from each modality. First, we demonstrate that image tokens possess\nsignificantly different semantics compared to text tokens, rendering\npre-trained language models no more effective in modeling them than randomly\ninitialized ones. 
Second, the text tokens in the image-text datasets are too\nsimple compared to normal language model pre-training data, which causes\ncatastrophic degradation of the language models' capability.\n","authors":["Yuhui Zhang","Brandon McKinzie","Zhe Gan","Vaishaal Shankar","Alexander Toshev"],"pdf_url":"https://arxiv.org/pdf/2311.16201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16480v1","updated":"2023-11-27T05:05:41Z","published":"2023-11-27T05:05:41Z","title":"MI-Gen: Multiple Instance Generation of Pathology Reports for Gigapixel\n Whole-Slide Images","summary":" Whole slide images are the foundation of digital pathology for the diagnosis\nand treatment of carcinomas. Writing pathology reports is laborious and\nerror-prone for inexperienced pathologists. To reduce the workload and improve\nclinical automation, we investigate how to generate pathology reports given\nwhole slide images. On the data end, we curated the largest WSI-text dataset\n(TCGA-PathoText). Specifically, we collected nearly 10000 high-quality WSI-text\npairs for visual-language models by recognizing and cleaning pathology reports\nwhich narrate diagnostic slides in TCGA. On the model end, we propose the\nmultiple instance generative model (MI-Gen) which can produce pathology reports\nfor gigapixel WSIs. We benchmark our model on the largest subset of\nTCGA-PathoText. Experimental results show our model can generate pathology\nreports which contain multiple clinical clues. Furthermore, WSI-text prediction\ncan be seen as an approach of visual-language pre-training, which enables our\nmodel to be transferred to downstream diagnostic tasks like carcinoma grading\nand phenotyping. We observe that simple semantic extraction from the pathology\nreports can achieve the best performance (0.838 of F1 score) on BRCA subtyping\nwithout adding extra parameters or tricky fine-tuning. Our collected dataset\nand related code will all be publicly available.\n","authors":["Pingyi Chen","Honglin Li","Chenglu Zhu","Sunyi Zheng","Lin Yang"],"pdf_url":"https://arxiv.org/pdf/2311.16480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17076v1","updated":"2023-11-27T22:23:27Z","published":"2023-11-27T22:23:27Z","title":"Compositional Chain-of-Thought Prompting for Large Multimodal Models","summary":" The combination of strong visual backbones and Large Language Model (LLM)\nreasoning has led to Large Multimodal Models (LMMs) becoming the current\nstandard for a wide range of vision and language (VL) tasks. However, recent\nresearch has shown that even the most advanced LMMs still struggle to capture\naspects of compositional visual reasoning, such as attributes and relationships\nbetween objects. One solution is to utilize scene graphs (SGs)--a formalization\nof objects and their relations and attributes that has been extensively used as\na bridge between the visual and textual domains. Yet, scene graph data requires\nscene graph annotations, which are expensive to collect and thus not easily\nscalable. Moreover, finetuning an LMM based on SG data can lead to catastrophic\nforgetting of the pretraining objective. To overcome this, inspired by\nchain-of-thought methods, we propose Compositional Chain-of-Thought (CCoT), a\nnovel zero-shot Chain-of-Thought prompting method that utilizes SG\nrepresentations in order to extract compositional knowledge from an LMM.\nSpecifically, we first generate an SG using the LMM, and then use that SG in\nthe prompt to produce a response. 
Through extensive experiments, we find that\nthe proposed CCoT approach not only improves LMM performance on several vision\nand language (VL) compositional benchmarks but also improves the performance of\nseveral popular LMMs on general multimodal benchmarks, without the need for\nfine-tuning or annotated ground-truth SGs.\n","authors":["Chancharik Mitra","Brandon Huang","Trevor Darrell","Roei Herzig"],"pdf_url":"https://arxiv.org/pdf/2311.17076v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10813v3","updated":"2023-11-27T20:53:35Z","published":"2023-11-17T18:59:56Z","title":"A Language Agent for Autonomous Driving","summary":" Human-level driving is an ultimate goal of autonomous driving. Conventional\napproaches formulate autonomous driving as a perception-prediction-planning\nframework, yet their systems do not capitalize on the inherent reasoning\nability and experiential knowledge of humans. In this paper, we propose a\nfundamental paradigm shift from current pipelines, exploiting Large Language\nModels (LLMs) as a cognitive agent to integrate human-like intelligence into\nautonomous driving systems. Our approach, termed Agent-Driver, transforms the\ntraditional autonomous driving pipeline by introducing a versatile tool library\naccessible via function calls, a cognitive memory of common sense and\nexperiential knowledge for decision-making, and a reasoning engine capable of\nchain-of-thought reasoning, task planning, motion planning, and\nself-reflection. Powered by LLMs, our Agent-Driver is endowed with intuitive\ncommon sense and robust reasoning capabilities, thus enabling a more nuanced,\nhuman-like approach to autonomous driving. We evaluate our approach on the\nlarge-scale nuScenes benchmark, and extensive experiments substantiate that our\nAgent-Driver significantly outperforms the state-of-the-art driving methods by\na large margin. Our approach also demonstrates superior interpretability and\nfew-shot learning ability compared to these methods. Code will be released.\n","authors":["Jiageng Mao","Junjie Ye","Yuxi Qian","Marco Pavone","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2311.10813v3.pdf","comment":"Project Page: https://usc-gvl.github.io/Agent-Driver/"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2311.16103v1","updated":"2023-11-27T18:59:58Z","published":"2023-11-27T18:59:58Z","title":"Video-Bench: A Comprehensive Benchmark and Toolkit for Evaluating\n Video-based Large Language Models","summary":" Video-based large language models (Video-LLMs) have been recently introduced,\ntargeting both fundamental improvements in perception and comprehension, and a\ndiverse range of user inquiries. In pursuit of the ultimate goal of achieving\nartificial general intelligence, a truly intelligent Video-LLM model should not\nonly see and understand the surroundings, but also possess human-level\ncommonsense, and make well-informed decisions for the users. To guide the\ndevelopment of such a model, the establishment of a robust and comprehensive\nevaluation system becomes crucial. To this end, this paper proposes\n\\textit{Video-Bench}, a new comprehensive benchmark along with a toolkit\nspecifically designed for evaluating Video-LLMs. The benchmark comprises 10\nmeticulously crafted tasks, evaluating the capabilities of Video-LLMs across\nthree distinct levels: Video-exclusive Understanding, Prior Knowledge-based\nQuestion-Answering, and Comprehension and Decision-making. 
In addition, we\nintroduce an automatic toolkit tailored to process model outputs for various\ntasks, facilitating the calculation of metrics and generating convenient final\nscores. We evaluate 8 representative Video-LLMs using \\textit{Video-Bench}. The\nfindings reveal that current Video-LLMs still fall considerably short of\nachieving human-like comprehension and analysis of real-world videos, offering\nvaluable insights for future research directions. The benchmark and toolkit are\navailable at: \\url{https://github.com/PKU-YuanGroup/Video-Bench}.\n","authors":["Munan Ning","Bin Zhu","Yujia Xie","Bin Lin","Jiaxi Cui","Lu Yuan","Dongdong Chen","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2311.16103v1.pdf","comment":"Benchmark is available at\n https://github.com/PKU-YuanGroup/Video-Bench"},{"id":"http://arxiv.org/abs/2311.16102v1","updated":"2023-11-27T18:59:53Z","published":"2023-11-27T18:59:53Z","title":"Test-time Adaptation of Discriminative Models via Diffusion Generative\n Feedback","summary":" The advancements in generative modeling, particularly the advent of diffusion\nmodels, have sparked a fundamental question: how can these models be\neffectively used for discriminative tasks? In this work, we find that\ngenerative models can be great test-time adapters for discriminative models.\nOur method, Diffusion-TTA, adapts pre-trained discriminative models such as\nimage classifiers, segmenters and depth predictors, to each unlabelled example\nin the test set using generative feedback from a diffusion model. We achieve\nthis by modulating the conditioning of the diffusion model using the output of\nthe discriminative model. We then maximize the image likelihood objective by\nbackpropagating the gradients to discriminative model's parameters. We show\nDiffusion-TTA significantly enhances the accuracy of various large-scale\npre-trained discriminative models, such as, ImageNet classifiers, CLIP models,\nimage pixel labellers and image depth predictors. Diffusion-TTA outperforms\nexisting test-time adaptation methods, including TTT-MAE and TENT, and\nparticularly shines in online adaptation setups, where the discriminative model\nis continually adapted to each example in the test set. We provide access to\ncode, results, and visualizations on our website:\nhttps://diffusion-tta.github.io/.\n","authors":["Mihir Prabhudesai","Tsung-Wei Ke","Alexander C. Li","Deepak Pathak","Katerina Fragkiadaki"],"pdf_url":"https://arxiv.org/pdf/2311.16102v1.pdf","comment":"Accepted at NeurIPS 2023 Webpage with Code:\n https://diffusion-tta.github.io/"},{"id":"http://arxiv.org/abs/2311.16101v1","updated":"2023-11-27T18:59:42Z","published":"2023-11-27T18:59:42Z","title":"How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for\n Vision LLMs","summary":" This work focuses on the potential of Vision LLMs (VLLMs) in visual\nreasoning. Different from prior studies, we shift our focus from evaluating\nstandard performance to introducing a comprehensive safety evaluation suite,\ncovering both out-of-distribution (OOD) generalization and adversarial\nrobustness. For the OOD evaluation, we present two novel VQA datasets, each\nwith one variant, designed to test model performance under challenging\nconditions. In exploring adversarial robustness, we propose a straightforward\nattack strategy for misleading VLLMs to produce visual-unrelated responses.\nMoreover, we assess the efficacy of two jailbreaking strategies, targeting\neither the vision or language component of VLLMs. 
Our evaluation of 21 diverse\nmodels, ranging from open-source VLLMs to GPT-4V, yields interesting\nobservations: 1) Current VLLMs struggle with OOD texts but not images, unless\nthe visual information is limited; and 2) These VLLMs can be easily misled by\ndeceiving vision encoders only, and their vision-language training often\ncompromise safety protocols. We release this safety evaluation suite at\nhttps://github.com/UCSC-VLAA/vllm-safety-benchmark.\n","authors":["Haoqin Tu","Chenhang Cui","Zijun Wang","Yiyang Zhou","Bingchen Zhao","Junlin Han","Wangchunshu Zhou","Huaxiu Yao","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2311.16101v1.pdf","comment":"H.T., C.C., and Z.W. contribute equally. Work done during H.T. and\n Z.W.'s internship at UCSC, and C.C. and Y.Z.'s internship at UNC"},{"id":"http://arxiv.org/abs/2311.16099v1","updated":"2023-11-27T18:59:30Z","published":"2023-11-27T18:59:30Z","title":"GART: Gaussian Articulated Template Models","summary":" We introduce Gaussian Articulated Template Model GART, an explicit,\nefficient, and expressive representation for non-rigid articulated subject\ncapturing and rendering from monocular videos. GART utilizes a mixture of\nmoving 3D Gaussians to explicitly approximate a deformable subject's geometry\nand appearance. It takes advantage of a categorical template model prior (SMPL,\nSMAL, etc.) with learnable forward skinning while further generalizing to more\ncomplex non-rigid deformations with novel latent bones. GART can be\nreconstructed via differentiable rendering from monocular videos in seconds or\nminutes and rendered in novel poses faster than 150fps.\n","authors":["Jiahui Lei","Yufu Wang","Georgios Pavlakos","Lingjie Liu","Kostas Daniilidis"],"pdf_url":"https://arxiv.org/pdf/2311.16099v1.pdf","comment":"13 pages, code available at\n https://www.cis.upenn.edu/~leijh/projects/gart/"},{"id":"http://arxiv.org/abs/2311.16098v1","updated":"2023-11-27T18:59:25Z","published":"2023-11-27T18:59:25Z","title":"On Bringing Robots Home","summary":" Throughout history, we have successfully integrated various machines into our\nhomes. Dishwashers, laundry machines, stand mixers, and robot vacuums are a few\nrecent examples. However, these machines excel at performing only a single task\neffectively. The concept of a \"generalist machine\" in homes - a domestic\nassistant that can adapt and learn from our needs, all while remaining\ncost-effective - has long been a goal in robotics that has been steadily\npursued for decades. In this work, we initiate a large-scale effort towards\nthis goal by introducing Dobb-E, an affordable yet versatile general-purpose\nsystem for learning robotic manipulation within household settings. Dobb-E can\nlearn a new task with only five minutes of a user showing it how to do it,\nthanks to a demonstration collection tool (\"The Stick\") we built out of cheap\nparts and iPhones. We use the Stick to collect 13 hours of data in 22 homes of\nNew York City, and train Home Pretrained Representations (HPR). Then, in a\nnovel home environment, with five minutes of demonstrations and fifteen minutes\nof adapting the HPR model, we show that Dobb-E can reliably solve the task on\nthe Stretch, a mobile robot readily available on the market. Across roughly 30\ndays of experimentation in homes of New York City and surrounding areas, we\ntest our system in 10 homes, with a total of 109 tasks in different\nenvironments, and finally achieve a success rate of 81%. 
Beyond success\npercentages, our experiments reveal a plethora of unique challenges absent or\nignored in lab robotics. These range from effects of strong shadows, to\nvariable demonstration quality by non-expert users. With the hope of\naccelerating research on home robots, and eventually seeing robot butlers in\nevery home, we open-source Dobb-E software stack and models, our data, and our\nhardware designs at https://dobb-e.com\n","authors":["Nur Muhammad Mahi Shafiullah","Anant Rai","Haritheja Etukuru","Yiqian Liu","Ishan Misra","Soumith Chintala","Lerrel Pinto"],"pdf_url":"https://arxiv.org/pdf/2311.16098v1.pdf","comment":"Project website and videos are available at https://dobb-e.com,\n technical documentation for getting started is available at\n https://docs.dobb-e.com, and code is released at\n https://github.com/notmahi/dobb-e"},{"id":"http://arxiv.org/abs/2311.16097v1","updated":"2023-11-27T18:59:10Z","published":"2023-11-27T18:59:10Z","title":"CG-HOI: Contact-Guided 3D Human-Object Interaction Generation","summary":" We propose CG-HOI, the first method to address the task of generating dynamic\n3D human-object interactions (HOIs) from text. We model the motion of both\nhuman and object in an interdependent fashion, as semantically rich human\nmotion rarely happens in isolation without any interactions. Our key insight is\nthat explicitly modeling contact between the human body surface and object\ngeometry can be used as strong proxy guidance, both during training and\ninference. Using this guidance to bridge human and object motion enables\ngenerating more realistic and physically plausible interaction sequences, where\nthe human body and corresponding object move in a coherent manner. Our method\nfirst learns to model human motion, object motion, and contact in a joint\ndiffusion process, inter-correlated through cross-attention. We then leverage\nthis learned contact for guidance during inference synthesis of realistic,\ncoherent HOIs. Extensive evaluation shows that our joint contact-based\nhuman-object interaction approach generates realistic and physically plausible\nsequences, and we show two applications highlighting the capabilities of our\nmethod. Conditioned on a given object trajectory, we can generate the\ncorresponding human motion without re-training, demonstrating strong\nhuman-object interdependency learning. Our approach is also flexible, and can\nbe applied to static real-world 3D scene scans.\n","authors":["Christian Diller","Angela Dai"],"pdf_url":"https://arxiv.org/pdf/2311.16097v1.pdf","comment":"Project page: https://cg-hoi.christian-diller.de Video:\n https://www.youtube.com/watch?v=GNyQwTwZ15s"},{"id":"http://arxiv.org/abs/2311.16096v1","updated":"2023-11-27T18:59:04Z","published":"2023-11-27T18:59:04Z","title":"Animatable Gaussians: Learning Pose-dependent Gaussian Maps for\n High-fidelity Human Avatar Modeling","summary":" Modeling animatable human avatars from RGB videos is a long-standing and\nchallenging problem. Recent works usually adopt MLP-based neural radiance\nfields (NeRF) to represent 3D humans, but it remains difficult for pure MLPs to\nregress pose-dependent garment details. To this end, we introduce Animatable\nGaussians, a new avatar representation that leverages powerful 2D CNNs and 3D\nGaussian splatting to create high-fidelity avatars. 
To associate 3D Gaussians\nwith the animatable avatar, we learn a parametric template from the input\nvideos, and then parameterize the template on two front \\& back canonical\nGaussian maps where each pixel represents a 3D Gaussian. The learned template\nis adaptive to the wearing garments for modeling looser clothes like dresses.\nSuch template-guided 2D parameterization enables us to employ a powerful\nStyleGAN-based CNN to learn the pose-dependent Gaussian maps for modeling\ndetailed dynamic appearances. Furthermore, we introduce a pose projection\nstrategy for better generalization given novel poses. Overall, our method can\ncreate lifelike avatars with dynamic, realistic and generalized appearances.\nExperiments show that our method outperforms other state-of-the-art approaches.\nCode: https://github.com/lizhe00/AnimatableGaussians\n","authors":["Zhe Li","Zerong Zheng","Lizhen Wang","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2311.16096v1.pdf","comment":"Projectpage: https://animatable-gaussians.github.io/, Code:\n https://github.com/lizhe00/AnimatableGaussians"},{"id":"http://arxiv.org/abs/2311.16094v1","updated":"2023-11-27T18:59:02Z","published":"2023-11-27T18:59:02Z","title":"Street TryOn: Learning In-the-Wild Virtual Try-On from Unpaired Person\n Images","summary":" Virtual try-on has become a popular research topic, but most existing methods\nfocus on studio images with a clean background. They can achieve plausible\nresults for this studio try-on setting by learning to warp a garment image to\nfit a person's body from paired training data, i.e., garment images paired with\nimages of people wearing the same garment. Such data is often collected from\ncommercial websites, where each garment is demonstrated both by itself and on\nseveral models. By contrast, it is hard to collect paired data for in-the-wild\nscenes, and therefore, virtual try-on for casual images of people against\ncluttered backgrounds is rarely studied.\n In this work, we fill the gap in the current virtual try-on research by (1)\nintroducing a Street TryOn benchmark to evaluate performance on street scenes\nand (2) proposing a novel method that can learn without paired data, from a set\nof in-the-wild person images directly. Our method can achieve robust\nperformance across shop and street domains using a novel DensePose warping\ncorrection method combined with diffusion-based inpainting controlled by pose\nand semantic segmentation. Our experiments demonstrate competitive performance\nfor standard studio try-on tasks and SOTA performance for street try-on and\ncross-domain try-on tasks.\n","authors":["Aiyu Cui","Jay Mahajan","Viraj Shah","Preeti Gomathinayagam","Svetlana Lazebnik"],"pdf_url":"https://arxiv.org/pdf/2311.16094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16091v1","updated":"2023-11-27T18:57:42Z","published":"2023-11-27T18:57:42Z","title":"Interactive Autonomous Navigation with Internal State Inference and\n Interactivity Estimation","summary":" Deep reinforcement learning (DRL) provides a promising way for intelligent\nagents (e.g., autonomous vehicles) to learn to navigate complex scenarios.\nHowever, DRL with neural networks as function approximators is typically\nconsidered a black box with little explainability and often suffers from\nsuboptimal performance, especially for autonomous navigation in highly\ninteractive multi-agent environments. 
To address these issues, we propose three\nauxiliary tasks with spatio-temporal relational reasoning and integrate them\ninto the standard DRL framework, which improves the decision making performance\nand provides explainable intermediate indicators. We propose to explicitly\ninfer the internal states (i.e., traits and intentions) of surrounding agents\n(e.g., human drivers) as well as to predict their future trajectories in the\nsituations with and without the ego agent through counterfactual reasoning.\nThese auxiliary tasks provide additional supervision signals to infer the\nbehavior patterns of other interactive agents. Multiple variants of framework\nintegration strategies are compared. We also employ a spatio-temporal graph\nneural network to encode relations between dynamic entities, which enhances\nboth internal state inference and decision making of the ego agent. Moreover,\nwe propose an interactivity estimation mechanism based on the difference\nbetween predicted trajectories in these two situations, which indicates the\ndegree of influence of the ego agent on other agents. To validate the proposed\nmethod, we design an intersection driving simulator based on the Intelligent\nIntersection Driver Model (IIDM) that simulates vehicles and pedestrians. Our\napproach achieves robust and state-of-the-art performance in terms of standard\nevaluation metrics and provides explainable intermediate indicators (i.e.,\ninternal states, and interactivity scores) for decision making.\n","authors":["Jiachen Li","David Isele","Kanghoon Lee","Jinkyoo Park","Kikuo Fujimura","Mykel J. Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2311.16091v1.pdf","comment":"18 pages, 14 figures"},{"id":"http://arxiv.org/abs/2311.16090v1","updated":"2023-11-27T18:56:37Z","published":"2023-11-27T18:56:37Z","title":"Self-correcting LLM-controlled Diffusion Models","summary":" Text-to-image generation has witnessed significant progress with the advent\nof diffusion models. Despite the ability to generate photorealistic images,\ncurrent text-to-image diffusion models still often struggle to accurately\ninterpret and follow complex input text prompts. In contrast to existing models\nthat aim to generate images only with their best effort, we introduce\nSelf-correcting LLM-controlled Diffusion (SLD). SLD is a framework that\ngenerates an image from the input prompt, assesses its alignment with the\nprompt, and performs self-corrections on the inaccuracies in the generated\nimage. Steered by an LLM controller, SLD turns text-to-image generation into an\niterative closed-loop process, ensuring correctness in the resulting image. SLD\nis not only training-free but can also be seamlessly integrated with diffusion\nmodels behind API access, such as DALL-E 3, to further boost the performance of\nstate-of-the-art diffusion models. Experimental results show that our approach\ncan rectify a majority of incorrect generations, particularly in generative\nnumeracy, attribute binding, and spatial relationships. Furthermore, by simply\nadjusting the instructions to the LLM, SLD can perform image editing tasks,\nbridging the gap between text-to-image generation and image editing pipelines.\nWe will make our code available for future research and applications.\n","authors":["Tsung-Han Wu","Long Lian","Joseph E. 
Gonzalez","Boyi Li","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2311.16090v1.pdf","comment":"16 pages, 10 figures"},{"id":"http://arxiv.org/abs/2311.16081v1","updated":"2023-11-27T18:52:09Z","published":"2023-11-27T18:52:09Z","title":"ViT-Lens-2: Gateway to Omni-modal Intelligence","summary":" Aiming to advance AI agents, large foundation models significantly improve\nreasoning and instruction execution, yet the current focus on vision and\nlanguage neglects the potential of perceiving diverse modalities in open-world\nenvironments. However, the success of data-driven vision and language models is\ncostly or even infeasible to be reproduced for rare modalities. In this paper,\nwe present ViT-Lens-2 that facilitates efficient omni-modal representation\nlearning by perceiving novel modalities with a pretrained ViT and aligning them\nto a pre-defined space. Specifically, the modality-specific lens is tuned to\nproject any-modal signals to an intermediate embedding space, which are then\nprocessed by a strong ViT with pre-trained visual knowledge. The encoded\nrepresentations are optimized toward aligning with the modal-independent space,\npre-defined by off-the-shelf foundation models. ViT-Lens-2 provides a unified\nsolution for representation learning of increasing modalities with two\nappealing advantages: (i) Unlocking the great potential of pretrained ViTs to\nnovel modalities effectively with efficient data regime; (ii) Enabling emergent\ndownstream capabilities through modality alignment and shared ViT parameters.\nWe tailor ViT-Lens-2 to learn representations for 3D point cloud, depth, audio,\ntactile and EEG, and set new state-of-the-art results across various\nunderstanding tasks, such as zero-shot classification. By seamlessly\nintegrating ViT-Lens-2 into Multimodal Foundation Models, we enable\nAny-modality to Text and Image Generation in a zero-shot manner. Code and\nmodels are available at https://github.com/TencentARC/ViT-Lens.\n","authors":["Weixian Lei","Yixiao Ge","Kun Yi","Jianfeng Zhang","Difei Gao","Dylan Sun","Yuying Ge","Ying Shan","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2311.16081v1.pdf","comment":"This work is a follow-up of \"ViT-Lens: Towards Omni-modal\n Representations\". arXiv admin note: text overlap with arXiv:2308.10185"},{"id":"http://arxiv.org/abs/2211.14309v2","updated":"2023-11-27T18:48:33Z","published":"2022-11-25T18:59:53Z","title":"FutureHuman3D: Forecasting Complex Long-Term 3D Human Behavior from\n Video Observations","summary":" We present a generative approach to forecast long-term future human behavior\nin 3D, requiring only weak supervision from readily available 2D human action\ndata. This is a fundamental task enabling many downstream applications. The\nrequired ground-truth data is hard to capture in 3D (mocap suits, expensive\nsetups) but easy to acquire in 2D (simple RGB cameras). Thus, we design our\nmethod to only require 2D RGB data while being able to generate 3D human motion\nsequences. We use a differentiable 2D projection scheme in an autoregressive\nmanner for weak supervision, and an adversarial loss for 3D regularization. Our\nmethod predicts long and complex behavior sequences (e.g. cooking, assembly)\nconsisting of multiple sub-actions. We tackle this in a semantically\nhierarchical manner, jointly predicting high-level coarse action labels\ntogether with their low-level fine-grained realizations as characteristic 3D\nhuman poses. 
We observe that these two action representations are coupled in\nnature, and joint prediction benefits both action and pose forecasting. Our\nexperiments demonstrate the complementary nature of joint action and 3D pose\nprediction: our joint approach outperforms each task treated individually,\nenables robust longer-term sequence prediction, and outperforms alternative\napproaches to forecast actions and characteristic 3D poses.\n","authors":["Christian Diller","Thomas Funkhouser","Angela Dai"],"pdf_url":"https://arxiv.org/pdf/2211.14309v2.pdf","comment":"Project Page: https://future-human-3d.christian-diller.de/ Video:\n https://www.youtube.com/watch?v=18du85YFXL0"},{"id":"http://arxiv.org/abs/2210.06462v3","updated":"2023-11-27T18:30:14Z","published":"2022-10-12T17:57:58Z","title":"Self-Guided Diffusion Models","summary":" Diffusion models have demonstrated remarkable progress in image generation\nquality, especially when guidance is used to control the generative process.\nHowever, guidance requires a large amount of image-annotation pairs for\ntraining and is thus dependent on their availability, correctness and\nunbiasedness. In this paper, we eliminate the need for such annotation by\ninstead leveraging the flexibility of self-supervision signals to design a\nframework for self-guided diffusion models. By leveraging a feature extraction\nfunction and a self-annotation function, our method provides guidance signals\nat various image granularities: from the level of holistic images to object\nboxes and even segmentation masks. Our experiments on single-label and\nmulti-label image datasets demonstrate that self-labeled guidance always\noutperforms diffusion models without guidance and may even surpass guidance\nbased on ground-truth labels, especially on unbalanced data. When equipped with\nself-supervised box or mask proposals, our method further generates visually\ndiverse yet semantically consistent images, without the need for any class,\nbox, or segment label annotation. Self-guided diffusion is simple, flexible and\nexpected to profit from deployment at scale. Source code will be at:\nhttps://taohu.me/sgdm/\n","authors":["Vincent Tao Hu","David W Zhang","Yuki M. Asano","Gertjan J. Burghouts","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2210.06462v3.pdf","comment":"CVPR 2023"},{"id":"http://arxiv.org/abs/2311.16060v1","updated":"2023-11-27T18:26:19Z","published":"2023-11-27T18:26:19Z","title":"DiffSLVA: Harnessing Diffusion Models for Sign Language Video\n Anonymization","summary":" Since American Sign Language (ASL) has no standard written form, Deaf signers\nfrequently share videos in order to communicate in their native language.\nHowever, since both hands and face convey critical linguistic information in\nsigned languages, sign language videos cannot preserve signer privacy. While\nsigners have expressed interest, for a variety of applications, in sign\nlanguage video anonymization that would effectively preserve linguistic\ncontent, attempts to develop such technology have had limited success, given\nthe complexity of hand movements and facial expressions. Existing approaches\nrely predominantly on precise pose estimations of the signer in video footage\nand often require sign language video datasets for training. These requirements\nprevent them from processing videos 'in the wild,' in part because of the\nlimited diversity present in current sign language video datasets. 
To address\nthese limitations, our research introduces DiffSLVA, a novel methodology that\nutilizes pre-trained large-scale diffusion models for zero-shot text-guided\nsign language video anonymization. We incorporate ControlNet, which leverages\nlow-level image features such as HED (Holistically-Nested Edge Detection)\nedges, to circumvent the need for pose estimation. Additionally, we develop a\nspecialized module dedicated to capturing facial expressions, which are\ncritical for conveying essential linguistic information in signed languages. We\nthen combine the above methods to achieve anonymization that better preserves\nthe essential linguistic content of the original signer. This innovative\nmethodology makes possible, for the first time, sign language video\nanonymization that could be used for real-world applications, which would offer\nsignificant benefits to the Deaf and Hard-of-Hearing communities. We\ndemonstrate the effectiveness of our approach with a series of signer\nanonymization experiments.\n","authors":["Zhaoyang Xia","Carol Neidle","Dimitris N. Metaxas"],"pdf_url":"https://arxiv.org/pdf/2311.16060v1.pdf","comment":"Project webpage: https://github.com/Jeffery9707/DiffSLVA"},{"id":"http://arxiv.org/abs/2311.16052v1","updated":"2023-11-27T18:14:03Z","published":"2023-11-27T18:14:03Z","title":"Exploring Attribute Variations in Style-based GANs using Diffusion\n Models","summary":" Existing attribute editing methods treat semantic attributes as binary,\nresulting in a single edit per attribute. However, attributes such as\neyeglasses, smiles, or hairstyles exhibit a vast range of diversity. In this\nwork, we formulate the task of \\textit{diverse attribute editing} by modeling\nthe multidimensional nature of attribute edits. This enables users to generate\nmultiple plausible edits per attribute. We capitalize on disentangled latent\nspaces of pretrained GANs and train a Denoising Diffusion Probabilistic Model\n(DDPM) to learn the latent distribution for diverse edits. Specifically, we\ntrain DDPM over a dataset of edit latent directions obtained by embedding image\npairs with a single attribute change. This leads to latent subspaces that\nenable diverse attribute editing. Applying diffusion in the highly compressed\nlatent space allows us to model rich distributions of edits within limited\ncomputational resources. Through extensive qualitative and quantitative\nexperiments conducted across a range of datasets, we demonstrate the\neffectiveness of our approach for diverse attribute editing. We also showcase\nthe results of our method applied for 3D editing of various face attributes.\n","authors":["Rishubh Parihar","Prasanna Balaji","Raghav Magazine","Sarthak Vora","Tejan Karmali","Varun Jampani","R. Venkatesh Babu"],"pdf_url":"https://arxiv.org/pdf/2311.16052v1.pdf","comment":"Neurips Workshop on Diffusion Models 2023"},{"id":"http://arxiv.org/abs/2311.16043v1","updated":"2023-11-27T18:07:58Z","published":"2023-11-27T18:07:58Z","title":"Relightable 3D Gaussian: Real-time Point Cloud Relighting with BRDF\n Decomposition and Ray Tracing","summary":" We present a novel differentiable point-based rendering framework for\nmaterial and lighting decomposition from multi-view images, enabling editing,\nray-tracing, and real-time relighting of the 3D point cloud. Specifically, a 3D\nscene is represented as a set of relightable 3D Gaussian points, where each\npoint is additionally associated with a normal direction, BRDF parameters, and\nincident lights from different directions. 
To achieve robust lighting\nestimation, we further divide incident lights of each point into global and\nlocal components, as well as view-dependent visibilities. The 3D scene is\noptimized through the 3D Gaussian Splatting technique while BRDF and lighting\nare decomposed by physically-based differentiable rendering. Moreover, we\nintroduce an innovative point-based ray-tracing approach based on the bounding\nvolume hierarchy for efficient visibility baking, enabling real-time rendering\nand relighting of 3D Gaussian points with accurate shadow effects. Extensive\nexperiments demonstrate improved BRDF estimation and novel view rendering\nresults compared to state-of-the-art material estimation approaches. Our\nframework showcases the potential to revolutionize the mesh-based graphics\npipeline with a relightable, traceable, and editable rendering pipeline solely\nbased on point cloud. Project\npage: https://nju-3dv.github.io/projects/Relightable3DGaussian/.\n","authors":["Jian Gao","Chun Gu","Youtian Lin","Hao Zhu","Xun Cao","Li Zhang","Yao Yao"],"pdf_url":"https://arxiv.org/pdf/2311.16043v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16042v1","updated":"2023-11-27T18:06:35Z","published":"2023-11-27T18:06:35Z","title":"Weakly-Supervised 3D Reconstruction of Clothed Humans via Normal Maps","summary":" We present a novel deep learning-based approach to the 3D reconstruction of\nclothed humans using weak supervision via 2D normal maps. Given a single RGB\nimage or multiview images, our network infers a signed distance function (SDF)\ndiscretized on a tetrahedral mesh surrounding the body in a rest pose.\nSubsequently, inferred pose and camera parameters are used to generate a normal\nmap from the SDF. A key aspect of our approach is the use of Marching\nTetrahedra to (uniquely) compute a triangulated surface from the SDF on the\ntetrahedral mesh, facilitating straightforward differentiation (and thus\nbackpropagation). Thus, given only ground truth normal maps (with no volumetric\nground truth information), we can train the network to produce SDF\nvalues from corresponding RGB images. Optionally, an additional multiview loss\nleads to improved results. We demonstrate the efficacy of our approach for both\nnetwork inference and 3D reconstruction.\n","authors":["Jane Wu","Diego Thomas","Ronald Fedkiw"],"pdf_url":"https://arxiv.org/pdf/2311.16042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16038v1","updated":"2023-11-27T17:59:41Z","published":"2023-11-27T17:59:41Z","title":"OccWorld: Learning a 3D Occupancy World Model for Autonomous Driving","summary":" Understanding how the 3D scene evolves is vital for making decisions in\nautonomous driving. Most existing methods achieve this by predicting the\nmovements of object boxes, which cannot capture more fine-grained scene\ninformation. In this paper, we explore a new framework of learning a world\nmodel, OccWorld, in the 3D Occupancy space to simultaneously predict the\nmovement of the ego car and the evolution of the surrounding scenes. We propose\nto learn a world model based on 3D occupancy rather than 3D bounding boxes and\nsegmentation maps for three reasons: 1) expressiveness. 3D occupancy can\ndescribe the more fine-grained 3D structure of the scene; 2) efficiency. 3D\noccupancy is more economical to obtain (e.g., from sparse LiDAR points). 3)\nversatility. 3D occupancy can adapt to both vision and LiDAR. 
To facilitate the\nmodeling of the world evolution, we learn a reconstruction-based scene\ntokenizer on the 3D occupancy to obtain discrete scene tokens to describe the\nsurrounding scenes. We then adopt a GPT-like spatial-temporal generative\ntransformer to generate subsequent scene and ego tokens to decode the future\noccupancy and ego trajectory. Extensive experiments on the widely used nuScenes\nbenchmark demonstrate the ability of OccWorld to effectively model the\nevolution of the driving scenes. OccWorld also produces competitive planning\nresults without using instance and map supervision. Code:\nhttps://github.com/wzzheng/OccWorld.\n","authors":["Wenzhao Zheng","Weiliang Chen","Yuanhui Huang","Borui Zhang","Yueqi Duan","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2311.16038v1.pdf","comment":"Code is available at: https://github.com/wzzheng/OccWorld"},{"id":"http://arxiv.org/abs/2311.16037v1","updated":"2023-11-27T17:58:21Z","published":"2023-11-27T17:58:21Z","title":"GaussianEditor: Editing 3D Gaussians Delicately with Text Instructions","summary":" Recently, impressive results have been achieved in 3D scene editing with text\ninstructions based on a 2D diffusion model. However, current diffusion models\nprimarily generate images by predicting noise in the latent space, and the\nediting is usually applied to the whole image, which makes it challenging to\nperform delicate, especially localized, editing for 3D scenes. Inspired by\nrecent 3D Gaussian splatting, we propose a systematic framework, named\nGaussianEditor, to edit 3D scenes delicately via 3D Gaussians with text\ninstructions. Benefiting from the explicit property of 3D Gaussians, we design\na series of techniques to achieve delicate editing. Specifically, we first\nextract the region of interest (RoI) corresponding to the text instruction,\naligning it to 3D Gaussians. The Gaussian RoI is further used to control the\nediting process. Our framework can achieve more delicate and precise editing of\n3D scenes than previous methods while enjoying much faster training speed, i.e.\nwithin 20 minutes on a single V100 GPU, more than twice as fast as\nInstruct-NeRF2NeRF (45 minutes -- 2 hours).\n","authors":["Jiemin Fang","Junjie Wang","Xiaopeng Zhang","Lingxi Xie","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2311.16037v1.pdf","comment":"Project page: https://GaussianEditor.github.io"},{"id":"http://arxiv.org/abs/2310.06627v2","updated":"2023-11-27T16:59:39Z","published":"2023-10-10T13:45:59Z","title":"What If the TV Was Off? Examining Counterfactual Reasoning Abilities of\n Multi-modal Language Models","summary":" Counterfactual reasoning, a fundamental aspect of human cognition, involves\ncontemplating alternatives to established facts or past events, significantly\nenhancing our abilities in planning and decision-making. In light of the\nadvancements in current multi-modal large language models, we explore their\neffectiveness in counterfactual reasoning. To facilitate this investigation, we\nintroduce a novel dataset, C-VQA, specifically designed to test the\ncounterfactual reasoning capabilities of modern multi-modal large language\nmodels. This dataset is constructed by infusing original questions with\ncounterfactual presuppositions, spanning various types such as numerical and\nboolean queries. It encompasses a mix of real and synthetic data, representing\na wide range of difficulty levels. 
Our thorough evaluations of contemporary\nvision-language models using this dataset have revealed substantial performance\ndrops, with some models showing up to a 40\\% decrease, highlighting a\nsignificant gap between current models and human-like vision reasoning\ncapabilities. We hope our dataset will serve as a vital benchmark for\nevaluating the counterfactual reasoning capabilities of models. Code and\ndataset are publicly available at https://bzhao.me/C-VQA/.\n","authors":["Letian Zhang","Xiaotong Zhai","Zhongkai Zhao","Yongshuo Zong","Xin Wen","Bingchen Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.06627v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09257v3","updated":"2023-11-27T16:51:40Z","published":"2023-11-14T23:07:50Z","title":"UFOGen: You Forward Once Large Scale Text-to-Image Generation via\n Diffusion GANs","summary":" Text-to-image diffusion models have demonstrated remarkable capabilities in\ntransforming textual prompts into coherent images, yet the computational cost\nof their inference remains a persistent challenge. To address this issue, we\npresent UFOGen, a novel generative model designed for ultra-fast, one-step\ntext-to-image synthesis. In contrast to conventional approaches that focus on\nimproving samplers or employing distillation techniques for diffusion models,\nUFOGen adopts a hybrid methodology, integrating diffusion models with a GAN\nobjective. Leveraging a newly introduced diffusion-GAN objective and\ninitialization with pre-trained diffusion models, UFOGen excels in efficiently\ngenerating high-quality images conditioned on textual descriptions in a single\nstep. Beyond traditional text-to-image generation, UFOGen showcases versatility\nin applications. Notably, UFOGen stands among the pioneering models enabling\none-step text-to-image generation and diverse downstream tasks, presenting a\nsignificant advancement in the landscape of efficient generative models.\n","authors":["Yanwu Xu","Yang Zhao","Zhisheng Xiao","Tingbo Hou"],"pdf_url":"https://arxiv.org/pdf/2311.09257v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16001v1","updated":"2023-11-27T16:47:09Z","published":"2023-11-27T16:47:09Z","title":"Automated Measurement of Vascular Calcification in Femoral\n Endarterectomy Patients Using Deep Learning","summary":" Atherosclerosis, a chronic inflammatory disease affecting the large arteries,\npresents a global health risk. Accurate analysis of diagnostic images, like\ncomputed tomographic angiograms (CTAs), is essential for staging and monitoring\nthe progression of atherosclerosis-related conditions, including peripheral\narterial disease (PAD). However, manual analysis of CTA images is\ntime-consuming and tedious. To address this limitation, we employed a deep\nlearning model to segment the vascular system in CTA images of PAD patients\nundergoing femoral endarterectomy surgery and to measure vascular calcification\nfrom the left renal artery to the patella. Utilizing proprietary CTA images of\n27 patients undergoing femoral endarterectomy surgery provided by Prisma Health\nMidlands, we developed a Deep Neural Network (DNN) model to first segment the\narterial system, starting from the descending aorta to the patella, and second,\nto provide a metric of arterial calcification. Our designed DNN achieved 83.4%\naverage Dice accuracy in segmenting arteries from aorta to patella, advancing\nthe state-of-the-art by 0.8%. 
Furthermore, our work is the first to present a\nrobust statistical analysis of automated calcification measurement in the lower\nextremities using deep learning, attaining a Mean Absolute Percentage Error\n(MAPE) of 9.5% and a correlation coefficient of 0.978 between automated and\nmanual calcification scores. These findings underscore the potential of deep\nlearning techniques as a rapid and accurate tool for medical professionals to\nassess calcification in the abdominal aorta and its branches above the patella.\nThe developed DNN model and related documentation in this project are available\nat GitHub page at https://github.com/pip-alireza/DeepCalcScoring.\n","authors":["Alireza Bagheri Rajeoni","Breanna Pederson","Daniel G. Clair","Susan M. Lessner","Homayoun Valafar"],"pdf_url":"https://arxiv.org/pdf/2311.16001v1.pdf","comment":"Published in MDPI Diagnostic journal, the code can be accessed via\n the GitHub link in the paper"},{"id":"http://arxiv.org/abs/2310.10541v2","updated":"2023-11-27T16:45:18Z","published":"2023-10-16T16:13:53Z","title":"AST: Effective Dataset Distillation through Alignment with Smooth and\n High-Quality Expert Trajectories","summary":" Training large AI models typically requires large-scale datasets in the\nmachine learning process, making training and parameter-tuning process both\ntime-consuming and costly. Some researchers address this problem by carefully\nsynthesizing a very small number of highly representative and informative\nsamples from real-world datasets. This approach, known as Dataset Distillation\n(DD), proposes a perspective for data-efficient learning. Despite recent\nprogress in this field, the performance of existing methods still cannot meet\nexpectations, and distilled datasets cannot effectively replace original\ndatasets. In this paper, unlike previous methods that focus solely on improving\nthe effectiveness of student distillation, we recognize and leverage the\nimportant mutual influence between expert and student models. We observed that\nthe smoothness of expert trajectories has a significant impact on subsequent\nstudent parameter alignment. Based on this, we propose an effective DD\nframework named AST, standing for Alignment with Smooth and high-quality expert\nTrajectories. We devise the integration of clipping loss and gradient penalty\nto regulate the rate of parameter changes in expert trajectory generation. To\nfurther refine the student parameter alignment with expert trajectory, we put\nforward representative initialization for the synthetic dataset and balanced\ninner-loop loss in response to the sensitivity exhibited towards randomly\ninitialized variables during distillation. We also propose two enhancement\nstrategies, namely intermediate matching loss and weight perturbation, to\nmitigate the potential occurrence of cumulative errors. We conduct extensive\nexperiments on datasets of different scales, sizes, and resolutions. 
The\nresults demonstrate that the proposed method significantly outperforms prior\nmethods.\n","authors":["Jiyuan Shen","Wenzhuo Yang","Kwok-Yan Lam"],"pdf_url":"https://arxiv.org/pdf/2310.10541v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15994v1","updated":"2023-11-27T16:43:37Z","published":"2023-11-27T16:43:37Z","title":"Adversarial Doodles: Interpretable and Human-drawable Attacks Provide\n Describable Insights","summary":" DNN-based image classification models are susceptible to adversarial attacks.\nMost previous adversarial attacks do not focus on the interpretability of the\ngenerated adversarial examples, and we cannot gain insights into the mechanism\nof the target classifier from the attacks. Therefore, we propose Adversarial\nDoodles, which have interpretable shapes. We optimize black B\'ezier curves to\nfool the target classifier by overlaying them onto the input image. By\nintroducing random perspective transformation and regularizing the doodled\narea, we obtain compact attacks that cause misclassification even when humans\nreplicate them by hand. Adversarial doodles provide describable and intriguing\ninsights into the relationship between our attacks and the classifier's output.\nWe utilize adversarial doodles and discover the bias inherent in the target\nclassifier, such as \"We add two strokes on its head, a triangle onto its body,\nand two lines inside the triangle on a bird image. Then, the classifier\nmisclassifies the image as a butterfly.\"\n","authors":["Ryoya Nara","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2311.15994v1.pdf","comment":"Submitted to CVPR 2024"},{"id":"http://arxiv.org/abs/2311.15993v1","updated":"2023-11-27T16:41:31Z","published":"2023-11-27T16:41:31Z","title":"Unified Batch Normalization: Identifying and Alleviating the Feature\n Condensation in Batch Normalization and a Unified Framework","summary":" Batch Normalization (BN) has become an essential technique in contemporary\nneural network design, enhancing training stability. Specifically, BN employs\ncentering and scaling operations to standardize features along the batch\ndimension and uses an affine transformation to recover features. Although\nstandard BN has shown its capability to improve deep neural network training\nand convergence, it still exhibits inherent limitations in certain cases. Most\nexisting techniques that enhance BN consider a single or a few aspects of BN.\nIn this paper, we first identify problems with BN from a feature perspective\nand find that feature condensation exists in the learning process when employing BN,\nwhich negatively affects testing performance. To tackle this problem, we\npropose a two-stage unified framework called Unified Batch Normalization (UBN).\nIn the first stage, we utilize a simple feature condensation threshold to\nalleviate the feature condensation, which hinders inappropriate statistic\nupdates in normalization. In the second stage, we unify various normalization\nvariants to boost each component of BN. Our experimental results reveal that\nUBN significantly enhances performance across different visual backbones and\nnotably expedites network training convergence, particularly in early training\nstages. 
Notably, our method improved top-1 accuracy by about 3% on ImageNet\nclassification with large batch sizes, showing the effectiveness of our\napproach in real-world scenarios.\n","authors":["Shaobo Wang","Xiangdong Zhang","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2311.15993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15991v1","updated":"2023-11-27T16:40:09Z","published":"2023-11-27T16:40:09Z","title":"DiffAnt: Diffusion Models for Action Anticipation","summary":" Anticipating future actions is inherently uncertain. Given an observed video\nsegment containing ongoing actions, multiple subsequent actions can plausibly\nfollow. This uncertainty becomes even larger when predicting far into the\nfuture. However, the majority of existing action anticipation models adhere to\na deterministic approach, neglecting to account for future uncertainties. In\nthis work, we rethink action anticipation from a generative view, employing\ndiffusion models to capture different possible future actions. In this\nframework, future actions are iteratively generated from standard Gaussian\nnoise in the latent space, conditioned on the observed video, and subsequently\ntransitioned into the action space. Extensive experiments on four benchmark\ndatasets, i.e., Breakfast, 50Salads, EpicKitchens, and EGTEA Gaze+, are\nperformed and the proposed method achieves superior or comparable results to\nstate-of-the-art methods, showing the effectiveness of a generative approach\nfor action anticipation. Our code and trained models will be published on\nGitHub.\n","authors":["Zeyun Zhong","Chengzhi Wu","Manuel Martin","Michael Voit","Juergen Gall","Jürgen Beyerer"],"pdf_url":"https://arxiv.org/pdf/2311.15991v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12144v4","updated":"2023-11-27T16:38:44Z","published":"2023-11-20T19:45:27Z","title":"Applications of Large Scale Foundation Models for Autonomous Driving","summary":" Since DARPA Grand Challenges (rural) in 2004/05 and Urban Challenges in 2007,\nautonomous driving has been the most active field of AI applications. Recently,\npowered by large language models (LLMs), chat systems such as ChatGPT and\nPaLM have emerged and rapidly become a promising direction to achieve artificial\ngeneral intelligence (AGI) in natural language processing (NLP). It is natural to\nthink that we could employ these abilities to reformulate autonomous\ndriving. By combining LLMs with foundation models, it is possible to utilize\nhuman knowledge, commonsense, and reasoning to rebuild autonomous driving\nsystems and move beyond the current long-tailed AI dilemma. In this paper, we investigate\nthe techniques of foundation models and LLMs applied for autonomous driving,\ncategorized as simulation, world models, data annotation, and planning or E2E\nsolutions, etc.\n","authors":["Yu Huang","Yue Chen","Zhu Li"],"pdf_url":"https://arxiv.org/pdf/2311.12144v4.pdf","comment":"22 pages. arXiv admin note: text overlap with arXiv:2304.03589 by\n other authors"},{"id":"http://arxiv.org/abs/2311.15980v1","updated":"2023-11-27T16:26:54Z","published":"2023-11-27T16:26:54Z","title":"Direct2.5: Diverse Text-to-3D Generation via Multi-view 2.5D Diffusion","summary":" Recent advances in generative AI have unveiled significant potential for the\ncreation of 3D content. However, current methods either apply a pre-trained 2D\ndiffusion model with the time-consuming score distillation sampling (SDS), or a\ndirect 3D diffusion model trained on limited 3D data losing generation\ndiversity. 
In this work, we approach the problem by employing a multi-view 2.5D\ndiffusion fine-tuned from a pre-trained 2D diffusion model. The multi-view 2.5D\ndiffusion directly models the structural distribution of 3D data, while still\nmaintaining the strong generalization ability of the original 2D diffusion\nmodel, filling the gap between 2D diffusion-based and direct 3D diffusion-based\nmethods for 3D content generation. During inference, multi-view normal maps are\ngenerated using the 2.5D diffusion, and a novel differentiable rasterization\nscheme is introduced to fuse the almost consistent multi-view normal maps into\na consistent 3D model. We further design a normal-conditioned multi-view image\ngeneration module for fast appearance generation given the 3D geometry. Our\nmethod is a one-pass diffusion process and does not require any SDS\noptimization as post-processing. We demonstrate through extensive experiments\nthat, our direct 2.5D generation with the specially-designed fusion scheme can\nachieve diverse, mode-seeking-free, and high-fidelity 3D content generation in\nonly 10 seconds. Project page: https://nju-3dv.github.io/projects/direct25.\n","authors":["Yuanxun Lu","Jingyang Zhang","Shiwei Li","Tian Fang","David McKinnon","Yanghai Tsin","Long Quan","Xun Cao","Yao Yao"],"pdf_url":"https://arxiv.org/pdf/2311.15980v1.pdf","comment":"Project webpage: https://nju-3dv.github.io/projects/direct25"},{"id":"http://arxiv.org/abs/2304.00553v3","updated":"2023-11-27T16:24:59Z","published":"2023-04-02T15:04:43Z","title":"From Isolated Islands to Pangea: Unifying Semantic Space for Human\n Action Understanding","summary":" As a vital step toward the intelligent agent, Action understanding matters\nfor intelligent agents and has attracted long-term attention. It can be formed\nas the mapping from the action physical space to the semantic space. Typically,\nresearchers built action datasets according to idiosyncratic choices to define\nclasses and push the envelope of benchmarks respectively. Thus, datasets are\nincompatible with each other like \"Isolated Islands\" due to semantic gaps and\nvarious class granularities, e.g., do housework in dataset A and wash plate in\ndataset B. We argue that a more principled semantic space is an urgent need to\nconcentrate the community efforts and enable us to use all datasets together to\npursue generalizable action learning. To this end, we design a structured\naction semantic space in view of verb taxonomy hierarchy and covering massive\nactions. By aligning the classes of previous datasets to our semantic space, we\ngather (image/video/skeleton/MoCap) datasets into a unified database in a\nunified label system, i.e., bridging ``isolated islands'' into a \"Pangea\".\nAccordingly, we propose a novel model mapping from the physical space to\nsemantic space to fully use Pangea. In extensive experiments, our new system\nshows significant superiority, especially in transfer learning. 
Code and data\nwill be made publicly available.\n","authors":["Yong-Lu Li","Xiaoqian Wu","Xinpeng Liu","Zehao Wang","Yiming Dou","Yikun Ji","Junyi Zhang","Yixing Li","Jingru Tan","Xudong Lu","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2304.00553v3.pdf","comment":"Project Webpage: https://mvig-rhos.com/pangea"},{"id":"http://arxiv.org/abs/2311.15977v1","updated":"2023-11-27T16:23:01Z","published":"2023-11-27T16:23:01Z","title":"Text2Loc: 3D Point Cloud Localization from Natural Language","summary":" We tackle the problem of 3D point cloud localization based on a few natural\nlinguistic descriptions and introduce a novel neural network, Text2Loc, that\nfully interprets the semantic relationship between points and text. Text2Loc\nfollows a coarse-to-fine localization pipeline: text-submap global place\nrecognition, followed by fine localization. In global place recognition,\nrelational dynamics among each textual hint are captured in a hierarchical\ntransformer with max-pooling (HTM), whereas a balance between positive and\nnegative pairs is maintained using text-submap contrastive learning. Moreover,\nwe propose a novel matching-free fine localization method to further refine the\nlocation predictions, which completely removes the need for complicated\ntext-instance matching and is lighter, faster, and more accurate than previous\nmethods. Extensive experiments show that Text2Loc improves the localization\naccuracy by up to $2\\times$ over the state-of-the-art on the KITTI360Pose\ndataset. We will make the code publicly available.\n","authors":["Yan Xia","Letian Shi","Zifeng Ding","João F. Henriques","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2311.15977v1.pdf","comment":"10 pages, 6 figures, 6 tables"},{"id":"http://arxiv.org/abs/2309.14809v2","updated":"2023-11-27T16:09:03Z","published":"2023-09-26T10:14:44Z","title":"ENIGMA-51: Towards a Fine-Grained Understanding of Human-Object\n Interactions in Industrial Scenarios","summary":" ENIGMA-51 is a new egocentric dataset acquired in an industrial scenario by\n19 subjects who followed instructions to complete the repair of electrical\nboards using industrial tools (e.g., electric screwdriver) and equipments\n(e.g., oscilloscope). The 51 egocentric video sequences are densely annotated\nwith a rich set of labels that enable the systematic study of human behavior in\nthe industrial domain. We provide benchmarks on four tasks related to human\nbehavior: 1) untrimmed temporal detection of human-object interactions, 2)\negocentric human-object interaction detection, 3) short-term object interaction\nanticipation and 4) natural language understanding of intents and entities.\nBaseline results show that the ENIGMA-51 dataset poses a challenging benchmark\nto study human behavior in industrial scenarios. We publicly release the\ndataset at https://iplab.dmi.unict.it/ENIGMA-51.\n","authors":["Francesco Ragusa","Rosario Leonardi","Michele Mazzamuto","Claudia Bonanno","Rosario Scavo","Antonino Furnari","Giovanni Maria Farinella"],"pdf_url":"https://arxiv.org/pdf/2309.14809v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15965v1","updated":"2023-11-27T16:07:39Z","published":"2023-11-27T16:07:39Z","title":"FALCON: Fairness Learning via Contrastive Attention Approach to\n Continual Semantic Scene Understanding in Open World","summary":" Continual Learning in semantic scene segmentation aims to continually learn\nnew unseen classes in dynamic environments while maintaining previously learned\nknowledge. 
Prior studies focused on modeling the catastrophic forgetting and\nbackground shift challenges in continual learning. However, fairness, another\nmajor challenge that causes unfair predictions leading to low performance among\nmajor and minor classes, still needs to be well addressed. In addition, prior\nmethods have yet to model the unknown classes well, thus resulting in producing\nnon-discriminative features among unknown classes. This paper presents a novel\nFairness Learning via Contrastive Attention Approach to continual learning in\nsemantic scene understanding. In particular, we first introduce a new Fairness\nContrastive Clustering loss to address the problems of catastrophic forgetting\nand fairness. Then, we propose an attention-based visual grammar approach to\neffectively model the background shift problem and unknown classes, producing\nbetter feature representations for different unknown classes. Through our\nexperiments, our proposed approach achieves State-of-the-Art (SOTA) performance\non different continual learning settings of three standard benchmarks, i.e.,\nADE20K, Cityscapes, and Pascal VOC. It promotes the fairness of the continual\nsemantic segmentation model.\n","authors":["Thanh-Dat Truong","Utsav Prabhu","Bhiksha Raj","Jackson Cothren","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2311.15965v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15964v1","updated":"2023-11-27T16:07:37Z","published":"2023-11-27T16:07:37Z","title":"Efficient Pre-training for Localized Instruction Generation of Videos","summary":" Procedural videos show step-by-step demonstrations of tasks like recipe\npreparation. Understanding such videos is challenging, involving the precise\nlocalization of steps and the generation of textual instructions. Manually\nannotating steps and writing instructions is costly, which limits the size of\ncurrent datasets and hinders effective learning. Leveraging large but noisy\nvideo-transcript datasets for pre-training can boost performance, but demands\nsignificant computational resources. Furthermore, transcripts contain\nirrelevant content and exhibit style variation compared to instructions written\nby human annotators. To mitigate both issues, we propose a technique,\nSieve-&-Swap, to automatically curate a smaller dataset: (i) Sieve filters\nirrelevant transcripts and (ii) Swap enhances the quality of the text\ninstruction by automatically replacing the transcripts with human-written\ninstructions from a text-only recipe dataset. The curated dataset, three orders\nof magnitude smaller than current web-scale datasets, enables efficient\ntraining of large-scale models with competitive performance. We complement our\nSieve-\\&-Swap approach with a Procedure Transformer (ProcX) for end-to-end step\nlocalization and instruction generation for procedural videos. 
When this model\nis pre-trained on our curated dataset, it achieves state-of-the-art performance\nin zero-shot and finetuning settings on YouCook2 and Tasty, while using a\nfraction of the computational resources.\n","authors":["Anil Batra","Davide Moltisanti","Laura Sevilla-Lara","Marcus Rohrbach","Frank Keller"],"pdf_url":"https://arxiv.org/pdf/2311.15964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15963v1","updated":"2023-11-27T16:07:34Z","published":"2023-11-27T16:07:34Z","title":"From Pixels to Titles: Video Game Identification by Screenshots using\n Convolutional Neural Networks","summary":" This paper investigates video game identification through single screenshots,\nutilizing five convolutional neural network (CNN) architectures (MobileNet,\nDenseNet, EfficientNetB0, EfficientNetB2, and EfficientNetB3) across 22 home\nconsole systems, spanning from Atari 2600 to PlayStation 5. Confirming the\nhypothesis, CNNs autonomously extract image features, enabling the\nidentification of game titles from screenshots without additional features.\nUsing ImageNet pre-trained weights, EfficientNetB3 achieves the highest average\naccuracy (74.51%), while DenseNet169 excels in 14 of the 22 systems. Employing\nalternative initial weights from another screenshots dataset boosts accuracy\nfor EfficientNetB2 and EfficientNetB3, with the latter reaching a peak accuracy\nof 76.36% and demonstrating reduced convergence epochs from 23.7 to 20.5 on\naverage. Overall, the combination of optimal architecture and weights attains\n77.67% accuracy, primarily led by EfficientNetB3 in 19 systems. These findings\nunderscore the efficacy of CNNs in video game identification through\nscreenshots.\n","authors":["Fabricio Breve"],"pdf_url":"https://arxiv.org/pdf/2311.15963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10093v2","updated":"2023-11-27T15:58:30Z","published":"2023-11-16T18:59:51Z","title":"The Chosen One: Consistent Characters in Text-to-Image Diffusion Models","summary":" Recent advances in text-to-image generation models have unlocked vast\npotential for visual creativity. However, these models struggle with generation\nof consistent characters, a crucial aspect for numerous real-world applications\nsuch as story visualization, game development asset design, advertising, and\nmore. Current methods typically rely on multiple pre-existing images of the\ntarget character or involve labor-intensive manual processes. In this work, we\npropose a fully automated solution for consistent character generation, with\nthe sole input being a text prompt. We introduce an iterative procedure that,\nat each stage, identifies a coherent set of images sharing a similar identity\nand extracts a more consistent identity from this set. Our quantitative\nanalysis demonstrates that our method strikes a better balance between prompt\nalignment and identity consistency compared to the baseline methods, and these\nfindings are reinforced by a user study. To conclude, we showcase several\npractical applications of our approach. 
Project page is available at\nhttps://omriavrahami.com/the-chosen-one\n","authors":["Omri Avrahami","Amir Hertz","Yael Vinker","Moab Arar","Shlomi Fruchter","Ohad Fried","Daniel Cohen-Or","Dani Lischinski"],"pdf_url":"https://arxiv.org/pdf/2311.10093v2.pdf","comment":"Project page is available at https://omriavrahami.com/the-chosen-one"},{"id":"http://arxiv.org/abs/2311.15941v1","updated":"2023-11-27T15:49:29Z","published":"2023-11-27T15:49:29Z","title":"Tell2Design: A Dataset for Language-Guided Floor Plan Generation","summary":" We consider the task of generating designs directly from natural language\ndescriptions, and consider floor plan generation as the initial research area.\nLanguage conditional generative models have recently been very successful in\ngenerating high-quality artistic images. However, designs must satisfy\ndifferent constraints that are not present in generating artistic images,\nparticularly spatial and relational constraints. We make multiple contributions\nto initiate research on this task. First, we introduce a novel dataset,\n\\textit{Tell2Design} (T2D), which contains more than $80k$ floor plan designs\nassociated with natural language instructions. Second, we propose a\nSequence-to-Sequence model that can serve as a strong baseline for future\nresearch. Third, we benchmark this task with several text-conditional image\ngeneration models. We conclude by conducting human evaluations on the generated\nsamples and providing an analysis of human performance. We hope our\ncontributions will propel the research on language-guided design generation\nforward.\n","authors":["Sicong Leng","Yang Zhou","Mohammed Haroon Dupty","Wee Sun Lee","Sam Conrad Joyce","Wei Lu"],"pdf_url":"https://arxiv.org/pdf/2311.15941v1.pdf","comment":"Paper published in ACL2023; Area Chair Award; Best Paper Nomination"},{"id":"http://arxiv.org/abs/2311.15939v1","updated":"2023-11-27T15:46:47Z","published":"2023-11-27T15:46:47Z","title":"Unleashing the Power of Prompt-driven Nucleus Instance Segmentation","summary":" Nuclear instance segmentation in histology images is crucial for a broad\nspectrum of clinical applications. Current prevailing nuclear instance\nsegmentation algorithms rely on regression of nuclei contours, distance maps,\nwatershed markers or a proxy nuclear representation of star-convex polygons.\nConsequently, these methods necessitate sophisticated post-processing\noperations to distinguish nuclei instances, which are commonly acknowledged to\nbe error-prone and parameter-sensitive. Recently, the segment anything model\n(SAM) has earned attracted huge attention within the domain of medical image\nsegmentation due to its impressive generalization ability and promptable\nproperty. Nevertheless, its potential on nuclear instance segmentation remains\nlargely underexplored. In this paper, we present a novel prompt-driven\nframework that consists of a point prompter and a SAM for automatic nuclei\ninstance segmentation. Specifically, the prompter learns to generate a unique\npoint prompt for each nucleus while the SAM is fine tuned to output the\ncorresponding mask of the cued nucleus. Furthermore, we propose to add adjacent\nnuclei as negative prompts to promote the model's ability to recognize\noverlapping nuclei. Without bells and whistles, our proposed method sets a new\nstate-of-the-art performance on three challenging benchmarks. 
Our code is\navailable at\n\\textcolor{magenta}{\\url{https://github.com/windygoo/PromptNucSeg}} .\n","authors":["Zhongyi Shui","Yunlong Zhang","Kai Yao","Chenglu Zhu","Yuxuan Sun","Lin Yang"],"pdf_url":"https://arxiv.org/pdf/2311.15939v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15937v1","updated":"2023-11-27T15:46:19Z","published":"2023-11-27T15:46:19Z","title":"Optimal Transport Aggregation for Visual Place Recognition","summary":" The task of Visual Place Recognition (VPR) aims to match a query image\nagainst references from an extensive database of images from different places,\nrelying solely on visual cues. State-of-the-art pipelines focus on the\naggregation of features extracted from a deep backbone, in order to form a\nglobal descriptor for each image. In this context, we introduce SALAD (Sinkhorn\nAlgorithm for Locally Aggregated Descriptors), which reformulates NetVLAD's\nsoft-assignment of local features to clusters as an optimal transport problem.\nIn SALAD, we consider both feature-to-cluster and cluster-to-feature relations\nand we also introduce a 'dustbin' cluster, designed to selectively discard\nfeatures deemed non-informative, enhancing the overall descriptor quality.\nAdditionally, we leverage and fine-tune DINOv2 as a backbone, which provides\nenhanced description power for the local features, and dramatically reduces the\nrequired training time. As a result, our single-stage method not only surpasses\nsingle-stage baselines in public VPR datasets, but also surpasses two-stage\nmethods that add a re-ranking with significantly higher cost. Code and models\nare available at https://github.com/serizba/salad.\n","authors":["Sergio Izquierdo","Javier Civera"],"pdf_url":"https://arxiv.org/pdf/2311.15937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15916v1","updated":"2023-11-27T15:24:54Z","published":"2023-11-27T15:24:54Z","title":"ADM-Loc: Actionness Distribution Modeling for Point-supervised Temporal\n Action Localization","summary":" This paper addresses the challenge of point-supervised temporal action\ndetection, in which only one frame per action instance is annotated in the\ntraining set. Self-training aims to provide supplementary supervision for the\ntraining process by generating pseudo-labels (action proposals) from a base\nmodel. However, most current methods generate action proposals by applying\nmanually designed thresholds to action classification probabilities and\ntreating adjacent snippets as independent entities. As a result, these methods\nstruggle to generate complete action proposals, exhibit sensitivity to\nfluctuations in action classification scores, and generate redundant and\noverlapping action proposals. This paper proposes a novel framework termed\nADM-Loc, which stands for Actionness Distribution Modeling for point-supervised\naction Localization. ADM-Loc generates action proposals by fitting a composite\ndistribution, comprising both Gaussian and uniform distributions, to the action\nclassification signals. This fitting process is tailored to each action class\npresent in the video and is applied separately for each action instance,\nensuring the distinctiveness of their distributions. 
ADM-Loc significantly\nenhances the alignment between the generated action proposals and ground-truth\naction instances and offers high-quality pseudo-labels for self-training.\nMoreover, to model action boundary snippets, it enforces consistency in action\nclassification scores during training by employing Gaussian kernels, supervised\nwith the proposed loss functions. ADM-Loc outperforms the state-of-the-art\npoint-supervised methods on THUMOS14 and ActivityNet-v1.2 datasets.\n","authors":["Elahe Vahdani","Yingli Tian"],"pdf_url":"https://arxiv.org/pdf/2311.15916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01908v2","updated":"2023-11-27T15:23:27Z","published":"2023-11-03T13:38:42Z","title":"LLM-driven Multimodal Target Volume Contouring in Radiation Oncology","summary":" Target volume contouring for radiation therapy is considered significantly\nmore challenging than the normal organ segmentation tasks as it necessitates\nthe utilization of both image and text-based clinical information. Inspired by\nthe recent advancement of large language models (LLMs) that can facilitate the\nintegration of the textural information and images, here we present a novel\nLLM-driven multi-modal AI that utilizes the clinical text information and is\napplicable to the challenging task of target volume contouring for radiation\ntherapy, and validate it within the context of breast cancer radiation therapy\ntarget volume contouring. Using external validation and data-insufficient\nenvironments, which attributes highly conducive to real-world applications, we\ndemonstrate that the proposed model exhibits markedly improved performance\ncompared to conventional vision-only AI models, particularly exhibiting robust\ngeneralization performance and data-efficiency. To our best knowledge, this is\nthe first LLM-driven multimodal AI model that integrates the clinical text\ninformation into target volume delineation for radiation oncology.\n","authors":["Yujin Oh","Sangjoon Park","Hwa Kyung Byun","Jin Sung Kim","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2311.01908v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15914v1","updated":"2023-11-27T15:23:25Z","published":"2023-11-27T15:23:25Z","title":"Computer Vision for Carriers: PATRIOT","summary":" Deck tracking performed on carriers currently involves a team of sailors\nmanually identifying aircraft and updating a digital user interface called the\nOuija Board. Improvements to the deck tracking process would result in\nincreased Sortie Generation Rates, and therefore applying automation is seen as\na critical method to improve deck tracking. However, the requirements on a\ncarrier ship do not allow for the installation of hardware-based location\nsensing technologies like Global Positioning System (GPS) sensors. PATRIOT\n(Panoramic Asset Tracking of Real-Time Information for the Ouija Tabletop) is a\nresearch effort and proposed solution to performing deck tracking with passive\nsensing and without the need for GPS sensors. PATRIOT is a prototype system\nwhich takes existing camera feeds, calculates aircraft poses, and updates a\nvirtual Ouija board interface with the current status of the assets. PATRIOT\nwould allow for faster, more accurate, and less laborious asset tracking for\naircraft, people, and support equipment. 
PATRIOT is anticipated to benefit the\nwarfighter by reducing cognitive workload, reducing manning requirements,\ncollecting data to improve logistics, and enabling an automation gateway for\nfuture efforts to improve efficiency and safety. The authors have developed and\ntested algorithms to perform pose estimations of assets in real-time including\nOpenPifPaf, High-Resolution Network (HRNet), HigherHRNet (HHRNet), Faster\nR-CNN, and in-house developed encoder-decoder network. The software was tested\nwith synthetic and real-world data and was able to accurately extract the pose\nof assets. Fusion, tracking, and real-world generality are planned to be\nimproved to ensure a successful transition to the fleet.\n","authors":["Ari Goodman","Gurpreet Singh","James Hing","Ryan O'Shea"],"pdf_url":"https://arxiv.org/pdf/2311.15914v1.pdf","comment":"8 pages, 18 figures. Published in the Proceedings of the ASNE 2023\n Technology, Systems & Ships Symposium. Reproduced with permission from the\n American Society of Naval Engineers. Distribution Statement A: Approved for\n public release; distribution is unlimited, as submitted under NAVAIR Public\n Release Authorization 2023-019"},{"id":"http://arxiv.org/abs/2311.15912v1","updated":"2023-11-27T15:22:17Z","published":"2023-11-27T15:22:17Z","title":"LIFT OFF: LoRaWAN Installation and Fiducial Tracking Operations for the\n Flightline of the Future","summary":" Real-time situational awareness for the location of assets is critical to\nensure missions are completed efficiently and requirements are satisfied. In\nmany commercial settings, the application of global positioning system (GPS)\nsensors is appropriate to achieve timely knowledge of the position of people\nand equipment. However, GPS sensors are not appropriate for all situations due\nto flight clearance and operations security concerns. LIFT OFF: LoRaWAN\nInstallation and Fiducial Tracking Operations for the Flightline of the Future\nproposes a hybrid framework solution to achieve real-time situational awareness\nfor people, support equipment, and aircraft positions regardless of the\nenvironment. This framework included a machine-vision component, which involved\nsetting up cameras to detect AprilTag decals that were installed on the sides\nof aircraft. The framework included a geolocation sensor component, which\ninvolved installing GPS sensors on support equipment and helmets. The framework\nalso included creating a long-range wide area network (LoRaWAN) to transfer\ndata and developing a user interface to display the data. The framework was\ntested at Naval Air Station Oceana Flightline, the United States Naval Test\nPilot School, and at Naval Air Warfare Center Aircraft Division Lakehurst. LIFT\nOFF successfully provided a real-time updating map of all tracked assets using\nGPS sensors for people and support equipment and with visual fiducials for\naircraft. The trajectories of the assets were recorded for logistical analysis\nand playback. Future follow-on work is anticipated to apply the technology to\nother environments including carriers and amphibious assault ships in addition\nto the flightline.\n","authors":["Ari Goodman","Ryan O'Shea"],"pdf_url":"https://arxiv.org/pdf/2311.15912v1.pdf","comment":"6 pages, 11 figures. Published in the Proceedings of the ASNE 2023\n Technology, Systems & Ships Symposium. Reproduced with permission from the\n American Society of Naval Engineers. 
Distribution Statement A: Approved for\n public release; distribution is unlimited, as submitted under NAVAIR Public\n Release Authorization 2023-020"},{"id":"http://arxiv.org/abs/2209.13204v2","updated":"2023-11-27T15:19:00Z","published":"2022-09-27T07:10:20Z","title":"NEURAL MARIONETTE: A Transformer-based Multi-action Human Motion\n Synthesis System","summary":" We present a neural network-based system for long-term, multi-action human\nmotion synthesis. The system, dubbed as NEURAL MARIONETTE, can produce\nhigh-quality and meaningful motions with smooth transitions from simple user\ninput, including a sequence of action tags with expected action duration, and\noptionally a hand-drawn moving trajectory if the user specifies. The core of\nour system is a novel Transformer-based motion generation model, namely\nMARIONET, which can generate diverse motions given action tags. Different from\nexisting motion generation models, MARIONET utilizes contextual information\nfrom the past motion clip and future action tag, dedicated to generating\nactions that can smoothly blend historical and future actions. Specifically,\nMARIONET first encodes target action tag and contextual information into an\naction-level latent code. The code is unfolded into frame-level control signals\nvia a time unrolling module, which could be then combined with other\nframe-level control signals like the target trajectory. Motion frames are then\ngenerated in an auto-regressive way. By sequentially applying MARIONET, the\nsystem NEURAL MARIONETTE can robustly generate long-term, multi-action motions\nwith the help of two simple schemes, namely \"Shadow Start\" and \"Action\nRevision\". Along with the novel system, we also present a new dataset dedicated\nto the multi-action motion synthesis task, which contains both action tags and\ntheir contextual information. Extensive experiments are conducted to study the\naction accuracy, naturalism, and transition smoothness of the motions generated\nby our system.\n","authors":["Weiqiang Wang","Xuefei Zhe","Qiuhong Ke","Di Kang","Tingguang Li","Ruizhi Chen","Linchao Bao"],"pdf_url":"https://arxiv.org/pdf/2209.13204v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15908v1","updated":"2023-11-27T15:14:38Z","published":"2023-11-27T15:14:38Z","title":"Enhancing Perceptual Quality in Video Super-Resolution through\n Temporally-Consistent Detail Synthesis using Diffusion Models","summary":" In this paper, we address the problem of video super-resolution (VSR) using\nDiffusion Models (DM), and present StableVSR. Our method significantly enhances\nthe perceptual quality of upscaled videos by synthesizing realistic and\ntemporally-consistent details. We turn a pre-trained DM for single image\nsuper-resolution into a VSR method by introducing the Temporal Conditioning\nModule (TCM). TCM uses Temporal Texture Guidance, which provides\nspatially-aligned and detail-rich texture information synthesized in adjacent\nframes. This guides the generative process of the current frame toward\nhigh-quality and temporally-consistent results. We introduce a Frame-wise\nBidirectional Sampling strategy to encourage the use of information from past\nto future and vice-versa. This strategy improves the perceptual quality of the\nresults and the temporal consistency across frames. We demonstrate the\neffectiveness of StableVSR in enhancing the perceptual quality of upscaled\nvideos compared to existing state-of-the-art methods for VSR. 
The code is\navailable at https://github.com/claudiom4sir/StableVSR.\n","authors":["Claudio Rota","Marco Buzzelli","Joost van de Weijer"],"pdf_url":"https://arxiv.org/pdf/2311.15908v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15906v1","updated":"2023-11-27T15:13:02Z","published":"2023-11-27T15:13:02Z","title":"MetaDefa: Meta-learning based on Domain Enhancement and Feature\n Alignment for Single Domain Generalization","summary":" The single domain generalization(SDG) based on meta-learning has emerged as\nan effective technique for solving the domain-shift problem. However, the\ninadequate match of data distribution between source and augmented domains and\ndifficult separation of domain-invariant features from domain-related features\nmake SDG model hard to achieve great generalization. Therefore, a novel\nmeta-learning method based on domain enhancement and feature alignment\n(MetaDefa) is proposed to improve the model generalization performance. First,\nthe background substitution and visual corruptions techniques are used to\ngenerate diverse and effective augmented domains. Then, the multi-channel\nfeature alignment module based on class activation maps and class agnostic\nactivation maps is designed to effectively extract adequate transferability\nknowledge. In this module, domain-invariant features can be fully explored by\nfocusing on similar target regions between source and augmented domains feature\nspace and suppressing the feature representation of non-similar target regions.\nExtensive experiments on two publicly available datasets show that MetaDefa has\nsignificant generalization performance advantages in unknown multiple target\ndomains.\n","authors":["Can Sun","Hao Zheng","Zhigang Hu","Liu Yang","Meiguang Zheng","Bo Xu"],"pdf_url":"https://arxiv.org/pdf/2311.15906v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2311.05697v2","updated":"2023-11-27T15:08:03Z","published":"2023-11-09T19:10:28Z","title":"3DGAUnet: 3D generative adversarial networks with a 3D U-Net based\n generator to achieve the accurate and effective synthesis of clinical tumor\n image data for pancreatic cancer","summary":" Pancreatic ductal adenocarcinoma (PDAC) presents a critical global health\nchallenge, and early detection is crucial for improving the 5-year survival\nrate. Recent medical imaging and computational algorithm advances offer\npotential solutions for early diagnosis. Deep learning, particularly in the\nform of convolutional neural networks (CNNs), has demonstrated success in\nmedical image analysis tasks, including classification and segmentation.\nHowever, the limited availability of clinical data for training purposes\ncontinues to provide a significant obstacle. Data augmentation, generative\nadversarial networks (GANs), and cross-validation are potential techniques to\naddress this limitation and improve model performance, but effective solutions\nare still rare for 3D PDAC, where contrast is especially poor owing to the high\nheterogeneity in both tumor and background tissues. In this study, we developed\na new GAN-based model, named 3DGAUnet, for generating realistic 3D CT images of\nPDAC tumors and pancreatic tissue, which can generate the interslice connection\ndata that the existing 2D CT image synthesis models lack. Our innovation is to\ndevelop a 3D U-Net architecture for the generator to improve shape and texture\nlearning for PDAC tumors and pancreatic tissue. 
Our approach offers a promising\npath to tackle the urgent requirement for creative and synergistic methods to\ncombat PDAC. The development of this GAN-based model has the potential to\nalleviate data scarcity issues, elevate the quality of synthesized data, and\nthereby facilitate the progression of deep learning models to enhance the\naccuracy and early detection of PDAC tumors, which could profoundly impact\npatient outcomes. Furthermore, this model has the potential to be adapted to\nother types of solid tumors, hence making significant contributions to the\nfield of medical imaging in terms of image processing models.\n","authors":["Yu Shi","Hannah Tang","Michael Baine","Michael A. Hollingsworth","Huijing Du","Dandan Zheng","Chi Zhang","Hongfeng Yu"],"pdf_url":"https://arxiv.org/pdf/2311.05697v2.pdf","comment":"Published on Cancers: Shi, Yu, Hannah Tang, Michael J. Baine, Michael\n A. Hollingsworth, Huijing Du, Dandan Zheng, Chi Zhang, and Hongfeng Yu. 2023.\n \"3DGAUnet: 3D Generative Adversarial Networks with a 3D U-Net Based Generator\n to Achieve the Accurate and Effective Synthesis of Clinical Tumor Image Data\n for Pancreatic Cancer\" Cancers 15, no. 23: 5496"},{"id":"http://arxiv.org/abs/2311.15896v1","updated":"2023-11-27T15:01:26Z","published":"2023-11-27T15:01:26Z","title":"Data Generation for Post-OCR correction of Cyrillic handwriting","summary":" This paper introduces a novel approach to post-Optical Character Recognition\nCorrection (POC) for handwritten Cyrillic text, addressing a significant gap in\ncurrent research methodologies. This gap is due to the lack of large text\ncorporas that provide OCR errors for further training of language-based POC\nmodels, which are demanding in terms of corpora size. Our study primarily\nfocuses on the development and application of a synthetic handwriting\ngeneration engine based on B\\'ezier curves. Such an engine generates highly\nrealistic handwritten text in any amounts, which we utilize to create a\nsubstantial dataset by transforming Russian text corpora sourced from the\ninternet. We apply a Handwritten Text Recognition (HTR) model to this dataset\nto identify OCR errors, forming the basis for our POC model training. The\ncorrection model is trained on a 90-symbol input context, utilizing a\npre-trained T5 architecture with a seq2seq correction task. We evaluate our\napproach on HWR200 and School_notebooks_RU datasets as they provide significant\nchallenges in the HTR domain. Furthermore, POC can be used to highlight errors\nfor teachers, evaluating student performance. This can be done simply by\ncomparing sentences before and after correction, displaying differences in\ntext. Our primary contribution lies in the innovative use of B\\'ezier curves\nfor Cyrillic text generation and subsequent error correction using a\nspecialized POC model. We validate our approach by presenting Word Accuracy\nRate (WAR) and Character Accuracy Rate (CAR) results, both with and without\npost-OCR correction, using real open corporas of handwritten Cyrillic text.\nThese results, coupled with our methodology, are designed to be reproducible,\npaving the way for further advancements in the field of OCR and handwritten\ntext analysis. 
Paper contributions can be found in\nhttps://github.com/dbrainio/CyrillicHandwritingPOC\n","authors":["Evgenii Davydkin","Aleksandr Markelov","Egor Iuldashev","Anton Dudkin","Ivan Krivorotov"],"pdf_url":"https://arxiv.org/pdf/2311.15896v1.pdf","comment":"17 pages, 27 figures, 6 tables, 26 references"},{"id":"http://arxiv.org/abs/2311.15890v1","updated":"2023-11-27T14:56:47Z","published":"2023-11-27T14:56:47Z","title":"Stability-Informed Initialization of Neural Ordinary Differential\n Equations","summary":" This paper addresses the training of Neural Ordinary Differential Equations\n(neural ODEs), and in particular explores the interplay between numerical\nintegration techniques, stability regions, step size, and initialization\ntechniques. It is shown how the choice of integration technique implicitly\nregularizes the learned model, and how the solver's corresponding stability\nregion affects training and prediction performance. From this analysis, a\nstability-informed parameter initialization technique is introduced. The\neffectiveness of the initialization method is displayed across several learning\nbenchmarks and industrial applications.\n","authors":["Theodor Westny","Arman Mohammadi","Daniel Jung","Erik Frisk"],"pdf_url":"https://arxiv.org/pdf/2311.15890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15879v1","updated":"2023-11-27T14:51:37Z","published":"2023-11-27T14:51:37Z","title":"EVCap: Retrieval-Augmented Image Captioning with External Visual-Name\n Memory for Open-World Comprehension","summary":" Large language models (LLMs)-based image captioning has the capability of\ndescribing objects not explicitly observed in training data; yet novel objects\noccur frequently, necessitating the requirement of sustaining up-to-date object\nknowledge for open-world comprehension. Instead of relying on large amounts of\ndata and scaling up network parameters, we introduce a highly effective\nretrieval-augmented image captioning method that prompts LLMs with object names\nretrieved from External Visual--name memory (EVCap). We build ever-changing\nobject knowledge memory using objects' visuals and names, enabling us to (i)\nupdate the memory at a minimal cost and (ii) effortlessly augment LLMs with\nretrieved object names utilizing a lightweight and fast-to-train model. Our\nmodel, which was trained only on the COCO dataset, can be adapted to out-domain\ndata without additional fine-tuning or retraining. Our comprehensive\nexperiments conducted on various benchmarks and synthetic commonsense-violating\ndata demonstrate that EVCap, comprising solely 3.97M trainable parameters,\nexhibits superior performance compared to other methods of equivalent model\nsize scale. Notably, it achieves competitive performance against specialist\nSOTAs with an enormous number of parameters. 
Our code is available at\nhttps://jiaxuan-li.github.io/EVCap.\n","authors":["Jiaxuan Li","Duc Minh Vo","Akihiro Sugimoto","Hideki Nakayama"],"pdf_url":"https://arxiv.org/pdf/2311.15879v1.pdf","comment":"Project page: https://jiaxuan-li.github.io/EVCap"},{"id":"http://arxiv.org/abs/2311.15876v1","updated":"2023-11-27T14:49:06Z","published":"2023-11-27T14:49:06Z","title":"RO-LLaMA: Generalist LLM for Radiation Oncology via Noise Augmentation\n and Consistency Regularization","summary":" Recent advancements in Artificial Intelligence (AI) have profoundly\ninfluenced medical fields, by providing tools to reduce clinical workloads.\nHowever, most AI models are constrained to execute uni-modal tasks, in stark\ncontrast to the comprehensive approaches utilized by medical professionals. To\naddress this, here we present RO-LLaMA, a versatile generalist large language\nmodel (LLM) tailored for the field of radiation oncology. This model seamlessly\ncovers a wide range of the workflow of radiation oncologists, adept at various\ntasks such as clinical report summarization, radiation therapy plan suggestion,\nand plan-guided therapy target volume segmentation. In particular, to maximize\nthe end-to-end performance, we further present a novel Consistency Embedding\nFine-Tuning (CEFTune) technique, which boosts LLM's robustness to additional\nerrors at the intermediates while preserving the capability of handling clean\ninputs, and creatively transform this concept into LLM-driven segmentation\nframework as Consistency Embedding Segmentation (CESEG). Experimental results\non multi-centre cohort sets demonstrate our proposed RO-LLaMA's promising\nperformance for diverse tasks with generalization capabilities.\n","authors":["Kwanyoung Kim","Yujin Oh","Sangjoon Park","Hwa Kyung Byun","Jin Sung Kim","Yong Bae Kim","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2311.15876v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00349v2","updated":"2023-11-27T14:42:52Z","published":"2023-06-01T05:06:56Z","title":"CALICO: Self-Supervised Camera-LiDAR Contrastive Pre-training for BEV\n Perception","summary":" Perception is crucial in the realm of autonomous driving systems, where\nbird's eye view (BEV)-based architectures have recently reached\nstate-of-the-art performance. The desirability of self-supervised\nrepresentation learning stems from the expensive and laborious process of\nannotating 2D and 3D data. Although previous research has investigated\npretraining methods for both LiDAR and camera-based 3D object detection, a\nunified pretraining framework for multimodal BEV perception is missing. In this\nstudy, we introduce CALICO, a novel framework that applies contrastive\nobjectives to both LiDAR and camera backbones. Specifically, CALICO\nincorporates two stages: point-region contrast (PRC) and region-aware\ndistillation (RAD). PRC better balances the region- and scene-level\nrepresentation learning on the LiDAR modality and offers significant\nperformance improvement compared to existing methods. RAD effectively achieves\ncontrastive distillation on our self-trained teacher model. CALICO's efficacy\nis substantiated by extensive evaluations on 3D object detection and BEV map\nsegmentation tasks, where it delivers significant performance improvements.\nNotably, CALICO outperforms the baseline method by 10.5% and 8.6% on NDS and\nmAP. Moreover, CALICO boosts the robustness of multimodal 3D object detection\nagainst adversarial attacks and corruption. 
Additionally, our framework can be\ntailored to different backbones and heads, positioning it as a promising\napproach for multimodal BEV perception.\n","authors":["Jiachen Sun","Haizhong Zheng","Qingzhao Zhang","Atul Prakash","Z. Morley Mao","Chaowei Xiao"],"pdf_url":"https://arxiv.org/pdf/2306.00349v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15864v1","updated":"2023-11-27T14:32:33Z","published":"2023-11-27T14:32:33Z","title":"InterControl: Generate Human Motion Interactions by Controlling Every\n Joint","summary":" Text-conditioned human motion generation model has achieved great progress by\nintroducing diffusion models and corresponding control signals. However, the\ninteraction between humans are still under explored. To model interactions of\narbitrary number of humans, we define interactions as human joint pairs that\nare either in contact or separated, and leverage {\\em Large Language Model\n(LLM) Planner} to translate interaction descriptions into contact plans. Based\non the contact plans, interaction generation could be achieved by spatially\ncontrollable motion generation methods by taking joint contacts as spatial\nconditions. We present a novel approach named InterControl for flexible spatial\ncontrol of every joint in every person at any time by leveraging motion\ndiffusion model only trained on single-person data. We incorporate a motion\ncontrolnet to generate coherent and realistic motions given sparse spatial\ncontrol signals and a loss guidance module to precisely align any joint to the\ndesired position in a classifier guidance manner via Inverse Kinematics (IK).\nExtensive experiments on HumanML3D and KIT-ML dataset demonstrate its\neffectiveness in versatile joint control. We also collect data of joint contact\npairs by LLMs to show InterControl's ability in human interaction generation.\n","authors":["Zhenzhi Wang","Jingbo Wang","Dahua Lin","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2311.15864v1.pdf","comment":"Generate human interactions with only single-person motion diffusion\n model via LLM generated joint contact pairs, code\n https://github.com/zhenzhiwang/intercontrol"},{"id":"http://arxiv.org/abs/2311.15856v1","updated":"2023-11-27T14:23:36Z","published":"2023-11-27T14:23:36Z","title":"JSSL: Joint Supervised and Self-supervised Learning for MRI\n Reconstruction","summary":" Magnetic Resonance Imaging represents an important diagnostic modality;\nhowever, its inherently slow acquisition process poses challenges in obtaining\nfully sampled k-space data under motion in clinical scenarios such as\nabdominal, cardiac, and prostate imaging. In the absence of fully sampled\nacquisitions, which can serve as ground truth data, training deep learning\nalgorithms in a supervised manner to predict the underlying ground truth image\nbecomes an impossible task. To address this limitation, self-supervised methods\nhave emerged as a viable alternative, leveraging available subsampled k-space\ndata to train deep learning networks for MRI reconstruction. Nevertheless,\nthese self-supervised approaches often fall short when compared to supervised\nmethodologies. In this paper, we introduce JSSL (Joint Supervised and\nSelf-supervised Learning), a novel training approach for deep learning-based\nMRI reconstruction algorithms aimed at enhancing reconstruction quality in\nscenarios where target dataset(s) containing fully sampled k-space measurements\nare unavailable. 
Our proposed method operates by simultaneously training a\nmodel in a self-supervised learning setting, using subsampled data from the\ntarget dataset(s), and in a supervised learning manner, utilizing data from\nother datasets, referred to as proxy datasets, where fully sampled k-space data\nis accessible. To demonstrate the efficacy of JSSL, we utilized subsampled\nprostate parallel MRI measurements as the target dataset, while employing fully\nsampled brain and knee k-space acquisitions as proxy datasets. Our results\nshowcase a substantial improvement over conventional self-supervised training\nmethods, thereby underscoring the effectiveness of our joint approach. We\nprovide a theoretical motivation for JSSL and establish a practical\n\"rule-of-thumb\" for selecting the most appropriate training approach for deep\nMRI reconstruction.\n","authors":["George Yiasemis","Nikita Moriakov","Clara I. Sánchez","Jan-Jakob Sonke","Jonas Teuwen"],"pdf_url":"https://arxiv.org/pdf/2311.15856v1.pdf","comment":"26 pages, 11 figures, 6 tables"},{"id":"http://arxiv.org/abs/2311.15855v1","updated":"2023-11-27T14:22:07Z","published":"2023-11-27T14:22:07Z","title":"SiTH: Single-view Textured Human Reconstruction with Image-Conditioned\n Diffusion","summary":" A long-standing goal of 3D human reconstruction is to create lifelike and\nfully detailed 3D humans from single images. The main challenge lies in\ninferring unknown human shapes, clothing, and texture information in areas not\nvisible in the images. To address this, we propose SiTH, a novel pipeline that\nuniquely integrates an image-conditioned diffusion model into a 3D mesh\nreconstruction workflow. At the core of our method lies the decomposition of\nthe ill-posed single-view reconstruction problem into hallucination and\nreconstruction subproblems. For the former, we employ a powerful generative\ndiffusion model to hallucinate back appearances from the input images. For the\nlatter, we leverage skinned body meshes as guidance to recover full-body\ntexture meshes from the input and back-view images. Our designs enable training\nof the pipeline with only about 500 3D human scans while maintaining its\ngenerality and robustness. Extensive experiments and user studies on two 3D\nreconstruction benchmarks demonstrated the efficacy of our method in generating\nrealistic, fully textured 3D humans from a diverse range of unseen images.\n","authors":["Hsuan-I Ho","Jie Song","Otmar Hilliges"],"pdf_url":"https://arxiv.org/pdf/2311.15855v1.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2310.12877v3","updated":"2023-11-27T14:22:05Z","published":"2023-10-19T16:32:18Z","title":"Perceptual Assessment and Optimization of High Dynamic Range Image\n Rendering","summary":" The increasing popularity of high dynamic range (HDR) imaging stems from its\nability to faithfully capture luminance levels in natural scenes. However, HDR\nimage quality assessment has been insufficiently addressed. Existing models are\nmostly designed for low dynamic range (LDR) images, which exhibit poorly\ncorrelated with human perception of HDR image quality. To fill this gap, we\npropose a family of HDR quality metrics by transferring the recent advancements\nin LDR domain. The key step in our approach is to employ a simple inverse\ndisplay model to decompose an HDR image into a stack of LDR images with varying\nexposures. Subsequently, these LDR images are evaluated using state-of-the-art\nLDR quality metrics. 
Our family of HDR quality models offer three notable\nadvantages. First, specific exposures (i.e., luminance ranges) can be weighted\nto emphasize their assessment when calculating the overall quality score.\nSecond, our HDR quality metrics directly inherit the capabilities of their base\nLDR quality models in assessing LDR images. Third, our metrics do not rely on\nhuman perceptual data of HDR image quality for re-calibration. Experiments\nconducted on four human-rated HDR image quality datasets indicate that our HDR\nquality metrics consistently outperform existing methods, including the HDR-VDP\nfamily. Furthermore, we demonstrate the promise of our models in the perceptual\noptimization of HDR novel view synthesis.\n","authors":["Peibei Cao","Rafal K. Mantiuk","Kede Ma"],"pdf_url":"https://arxiv.org/pdf/2310.12877v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15851v1","updated":"2023-11-27T14:17:41Z","published":"2023-11-27T14:17:41Z","title":"Single-Model and Any-Modality for Video Object Tracking","summary":" In the realm of video object tracking, auxiliary modalities such as depth,\nthermal, or event data have emerged as valuable assets to complement the RGB\ntrackers. In practice, most existing RGB trackers learn a single set of\nparameters to use them across datasets and applications. However, a similar\nsingle-model unification for multi-modality tracking presents several\nchallenges. These challenges stem from the inherent heterogeneity of inputs --\neach with modality-specific representations, the scarcity of multi-modal\ndatasets, and the absence of all the modalities at all times. In this work, we\nintroduce Un-Track, a \\underline{Un}ified Tracker of a single set of parameters\nfor any modality. To handle any modality, our method learns their common latent\nspace through low-rank factorization and reconstruction techniques. More\nimportantly, we use only the RGB-X pairs to learn the common latent space. This\nunique shared representation seamlessly binds all modalities together, enabling\neffective unification and accommodating any missing modality, all within a\nsingle transformer-based architecture and without the need for\nmodality-specific fine-tuning. Our Un-Track achieves +8.1 absolute F-score\ngain, on the DepthTrack dataset, by introducing only +2.14 (over 21.50) GFLOPs\nwith +6.6M (over 93M) parameters, through a simple yet efficient prompting\nstrategy. Extensive comparisons on five benchmark datasets with different\nmodalities show that Un-Track surpasses both SOTA unified trackers and\nmodality-specific finetuned counterparts, validating our effectiveness and\npracticality.\n","authors":["Zongwei Wu","Jilai Zheng","Xiangxuan Ren","Florin-Alexandru Vasluianu","Chao Ma","Danda Pani Paudel","Luc Van Gool","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2311.15851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15847v1","updated":"2023-11-27T14:12:51Z","published":"2023-11-27T14:12:51Z","title":"Cell Maps Representation For Lung Adenocarcinoma Growth Patterns\n Classification In Whole Slide Images","summary":" Lung adenocarcinoma is a morphologically heterogeneous disease, characterized\nby five primary histologic growth patterns. 
The quantity of these patterns can\nbe related to tumor behavior and has a significant impact on patient prognosis.\nIn this work, we propose a novel machine learning pipeline capable of\nclassifying tissue tiles into one of the five patterns or as non-tumor, with an\nArea Under the Receiver Operating Characteristic Curve (AUCROC) score of 0.97.\nOur model's strength lies in its comprehensive consideration of cellular\nspatial patterns, where it first generates cell maps from Hematoxylin and Eosin\n(H&E) whole slide images (WSIs), which are then fed into a convolutional neural\nnetwork classification model. Exploiting these cell maps provides the model\nwith robust generalizability to new data, achieving approximately 30% higher\naccuracy on unseen test-sets compared to current state of the art approaches.\nThe insights derived from our model can be used to predict prognosis, enhancing\npatient outcomes.\n","authors":["Arwa Al-Rubaian","Gozde N. Gunesli","Wajd A. Althakfi","Ayesha Azam","Nasir Rajpoot","Shan E Ahmed Raza"],"pdf_url":"https://arxiv.org/pdf/2311.15847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15846v1","updated":"2023-11-27T14:11:54Z","published":"2023-11-27T14:11:54Z","title":"Learning with Noisy Low-Cost MOS for Image Quality Assessment via\n Dual-Bias Calibration","summary":" Learning based image quality assessment (IQA) models have obtained impressive\nperformance with the help of reliable subjective quality labels, where mean\nopinion score (MOS) is the most popular choice. However, in view of the\nsubjective bias of individual annotators, the labor-abundant MOS (LA-MOS)\ntypically requires a large collection of opinion scores from multiple\nannotators for each image, which significantly increases the learning cost. In\nthis paper, we aim to learn robust IQA models from low-cost MOS (LC-MOS), which\nonly requires very few opinion scores or even a single opinion score for each\nimage. More specifically, we consider the LC-MOS as the noisy observation of\nLA-MOS and enforce the IQA model learned from LC-MOS to approach the unbiased\nestimation of LA-MOS. In this way, we represent the subjective bias between\nLC-MOS and LA-MOS, and the model bias between IQA predictions learned from\nLC-MOS and LA-MOS (i.e., dual-bias) as two latent variables with unknown\nparameters. By means of the expectation-maximization based alternating\noptimization, we can jointly estimate the parameters of the dual-bias, which\nsuppresses the misleading of LC-MOS via a gated dual-bias calibration (GDBC)\nmodule. To the best of our knowledge, this is the first exploration of robust\nIQA model learning from noisy low-cost labels. Theoretical analysis and\nextensive experiments on four popular IQA datasets show that the proposed\nmethod is robust toward different bias rates and annotation numbers and\nsignificantly outperforms the other learning based IQA models when only LC-MOS\nis available. Furthermore, we also achieve comparable performance with respect\nto the other models learned with LA-MOS.\n","authors":["Lei Wang","Qingbo Wu","Desen Yuan","King Ngi Ngan","Hongliang Li","Fanman Meng","Linfeng Xu"],"pdf_url":"https://arxiv.org/pdf/2311.15846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15841v1","updated":"2023-11-27T14:07:13Z","published":"2023-11-27T14:07:13Z","title":"Learning Disentangled Identifiers for Action-Customized Text-to-Image\n Generation","summary":" This study focuses on a novel task in text-to-image (T2I) generation, namely\naction customization. 
The objective of this task is to learn the co-existing\naction from limited data and generalize it to unseen humans or even animals.\nExperimental results show that existing subject-driven customization methods\nfail to learn the representative characteristics of actions and struggle in\ndecoupling actions from context features, including appearance. To overcome the\npreference for low-level features and the entanglement of high-level features,\nwe propose an inversion-based method Action-Disentangled Identifier (ADI) to\nlearn action-specific identifiers from the exemplar images. ADI first expands\nthe semantic conditioning space by introducing layer-wise identifier tokens,\nthereby increasing the representational richness while distributing the\ninversion across different features. Then, to block the inversion of\naction-agnostic features, ADI extracts the gradient invariance from the\nconstructed sample triples and masks the updates of irrelevant channels. To\ncomprehensively evaluate the task, we present an ActionBench that includes a\nvariety of actions, each accompanied by meticulously selected samples. Both\nquantitative and qualitative results show that our ADI outperforms existing\nbaselines in action-customized T2I generation.\n","authors":["Siteng Huang","Biao Gong","Yutong Feng","Xi Chen","Yuqian Fu","Yu Liu","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15841v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15836v1","updated":"2023-11-27T13:59:53Z","published":"2023-11-27T13:59:53Z","title":"Syn3DWound: A Synthetic Dataset for 3D Wound Bed Analysis","summary":" Wound management poses a significant challenge, particularly for bedridden\npatients and the elderly. Accurate diagnostic and healing monitoring can\nsignificantly benefit from modern image analysis, providing accurate and\nprecise measurements of wounds. Despite several existing techniques, the\nshortage of expansive and diverse training datasets remains a significant\nobstacle to constructing machine learning-based frameworks. This paper\nintroduces Syn3DWound, an open-source dataset of high-fidelity simulated wounds\nwith 2D and 3D annotations. We propose baseline methods and a benchmarking\nframework for automated 3D morphometry analysis and 2D/3D wound segmentation.\n","authors":["Léo Lebrat","Rodrigo Santa Cruz","Remi Chierchia","Yulia Arzhaeva","Mohammad Ali Armin","Joshua Goldsmith","Jeremy Oorloff","Prithvi Reddy","Chuong Nguyen","Lars Petersson","Michelle Barakat-Johnson","Georgina Luscombe","Clinton Fookes","Olivier Salvado","David Ahmedt-Aristizabal"],"pdf_url":"https://arxiv.org/pdf/2311.15836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15830v1","updated":"2023-11-27T13:53:53Z","published":"2023-11-27T13:53:53Z","title":"A-JEPA: Joint-Embedding Predictive Architecture Can Listen","summary":" This paper presents that the masked-modeling principle driving the success of\nlarge foundational vision models can be effectively applied to audio by making\npredictions in a latent space. We introduce Audio-based Joint-Embedding\nPredictive Architecture (A-JEPA), a simple extension method for self-supervised\nlearning from the audio spectrum. Following the design of I-JPEA, our A-JEPA\nencodes visible audio spectrogram patches with a curriculum masking strategy\nvia context encoder, and predicts the representations of regions sampled at\nwell-designed locations. 
The target representations of those regions are\nextracted by the exponential moving average of the context encoder, \\emph{i.e.},\nthe target encoder, on the whole spectrogram. We find it beneficial to transfer\nrandom block masking into time-frequency aware masking in a curriculum manner,\nconsidering the highly correlated structure in local time and frequency in\naudio spectrograms. To enhance contextual semantic understanding and\nrobustness, we fine-tune the encoder with a regularized masking on target\ndatasets, instead of dropping or zeroing the input. Empirically, when built with\na Vision Transformer structure, we find A-JEPA to be highly scalable and to set\nnew state-of-the-art performance on multiple audio and speech classification\ntasks, outperforming other recent models that use externally supervised\npre-training.\n","authors":["Zhengcong Fei","Mingyuan Fan","Junshi Huang"],"pdf_url":"https://arxiv.org/pdf/2311.15830v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14084v2","updated":"2023-11-27T13:43:19Z","published":"2023-11-23T16:22:58Z","title":"AI-Generated Images Introduce Invisible Relevance Bias to Text-Image\n Retrieval","summary":" With the advancement of generation models, AI-generated content (AIGC) is\nbecoming more realistic, flooding the Internet. A recent study suggests that\nthis phenomenon has elevated the issue of source bias in text retrieval for web\nsearches. Specifically, neural retrieval models tend to rank generated texts\nhigher than human-written texts. In this paper, we extend the study of this\nbias to cross-modal retrieval. Firstly, we successfully construct a suitable\nbenchmark to explore the existence of the bias. Subsequent extensive\nexperiments on this benchmark reveal that AI-generated images introduce an\ninvisible relevance bias to text-image retrieval models. Specifically, our\nexperiments show that text-image retrieval models tend to rank the AI-generated\nimages higher than the real images, even though the AI-generated images do not\nexhibit more visually relevant features to the query than real images. This\ninvisible relevance bias is prevalent across retrieval models with varying\ntraining data and architectures. Furthermore, our subsequent exploration\nreveals that the inclusion of AI-generated images in the training data of the\nretrieval models exacerbates the invisible relevance bias. The above phenomenon\ntriggers a vicious cycle, which makes the invisible relevance bias become more\nand more serious. To elucidate the potential causes of invisible relevance and\naddress the aforementioned issues, we introduce an effective training method\naimed at alleviating the invisible relevance bias. Subsequently, we apply our\nproposed debiasing method to retroactively identify the causes of invisible\nrelevance, revealing that the AI-generated images induce the image encoder to\nembed additional information into their representation. 
This information\nexhibits a certain consistency across generated images with different semantics\nand can make the retriever estimate a higher relevance score.\n","authors":["Shicheng Xu","Danyang Hou","Liang Pang","Jingcheng Deng","Jun Xu","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2311.14084v2.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2311.15813v1","updated":"2023-11-27T13:39:44Z","published":"2023-11-27T13:39:44Z","title":"FlowZero: Zero-Shot Text-to-Video Synthesis with LLM-Driven Dynamic\n Scene Syntax","summary":" Text-to-video (T2V) generation is a rapidly growing research area that aims\nto translate the scenes, objects, and actions within complex video text into a\nsequence of coherent visual frames. We present FlowZero, a novel framework that\ncombines Large Language Models (LLMs) with image diffusion models to generate\ntemporally-coherent videos. FlowZero uses LLMs to understand complex\nspatio-temporal dynamics from text, where LLMs can generate a comprehensive\ndynamic scene syntax (DSS) containing scene descriptions, object layouts, and\nbackground motion patterns. These elements in DSS are then used to guide the\nimage diffusion model for video generation with smooth object motions and\nframe-to-frame coherence. Moreover, FlowZero incorporates an iterative\nself-refinement process, enhancing the alignment between the spatio-temporal\nlayouts and the textual prompts for the videos. To enhance global coherence, we\npropose enriching the initial noise of each frame with motion dynamics to\ncontrol the background movement and camera motion adaptively. By using\nspatio-temporal syntaxes to guide the diffusion process, FlowZero achieves\nimprovement in zero-shot video synthesis, generating coherent videos with vivid\nmotion.\n","authors":["Yu Lu","Linchao Zhu","Hehe Fan","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2311.15813v1.pdf","comment":"Project page: https://flowzero-video.github.io"},{"id":"http://arxiv.org/abs/2310.12190v2","updated":"2023-11-27T13:36:04Z","published":"2023-10-18T14:42:16Z","title":"DynamiCrafter: Animating Open-domain Images with Video Diffusion Priors","summary":" Animating a still image offers an engaging visual experience. Traditional\nimage animation techniques mainly focus on animating natural scenes with\nstochastic dynamics (e.g. clouds and fluid) or domain-specific motions (e.g.\nhuman hair or body motions), and thus limits their applicability to more\ngeneral visual content. To overcome this limitation, we explore the synthesis\nof dynamic content for open-domain images, converting them into animated\nvideos. The key idea is to utilize the motion prior of text-to-video diffusion\nmodels by incorporating the image into the generative process as guidance.\nGiven an image, we first project it into a text-aligned rich context\nrepresentation space using a query transformer, which facilitates the video\nmodel to digest the image content in a compatible fashion. However, some visual\ndetails still struggle to be preserved in the resultant videos. To supplement\nwith more precise image information, we further feed the full image to the\ndiffusion model by concatenating it with the initial noises. 
Experimental\nresults show that our proposed method can produce visually convincing and more\nlogical & natural motions, as well as higher conformity to the input image.\nComparative evaluation demonstrates the notable superiority of our approach\nover existing competitors.\n","authors":["Jinbo Xing","Menghan Xia","Yong Zhang","Haoxin Chen","Wangbo Yu","Hanyuan Liu","Xintao Wang","Tien-Tsin Wong","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2310.12190v2.pdf","comment":"Project page: https://doubiiu.github.io/projects/DynamiCrafter"},{"id":"http://arxiv.org/abs/2311.15812v1","updated":"2023-11-27T13:35:20Z","published":"2023-11-27T13:35:20Z","title":"C-SAW: Self-Supervised Prompt Learning for Image Generalization in\n Remote Sensing","summary":" We focus on domain and class generalization problems in analyzing optical\nremote sensing images, using the large-scale pre-trained vision-language model\n(VLM), CLIP. While contrastively trained VLMs show impressive zero-shot\ngeneralization performance, their effectiveness is limited when dealing with\ndiverse domains during training and testing. Existing prompt learning\ntechniques overlook the importance of incorporating domain and content\ninformation into the prompts, which results in a drop in performance while\ndealing with such multi-domain data. To address these challenges, we propose a\nsolution that ensures domain-invariant prompt learning while enhancing the\nexpressiveness of visual features. We observe that CLIP's vision encoder\nstruggles to identify contextual image information, particularly when image\npatches are jumbled up. This issue is especially severe in optical remote\nsensing images, where land-cover classes exhibit well-defined contextual\nappearances. To this end, we introduce C-SAW, a method that complements CLIP\nwith a self-supervised loss in the visual space and a novel prompt learning\ntechnique that emphasizes both visual domain and content-specific features. We\nkeep the CLIP backbone frozen and introduce a small set of projectors for both\nthe CLIP encoders to train C-SAW contrastively. Experimental results\ndemonstrate the superiority of C-SAW across multiple remote sensing benchmarks\nand different generalization tasks.\n","authors":["Avigyan Bhattacharya","Mainak Singha","Ankit Jha","Biplab Banerjee"],"pdf_url":"https://arxiv.org/pdf/2311.15812v1.pdf","comment":"Accepted in ACM ICVGIP 2023"},{"id":"http://arxiv.org/abs/2311.15806v1","updated":"2023-11-27T13:29:34Z","published":"2023-11-27T13:29:34Z","title":"PIPE : Parallelized Inference Through Post-Training Quantization\n Ensembling of Residual Expansions","summary":" Deep neural networks (DNNs) are ubiquitous in computer vision and natural\nlanguage processing, but suffer from high inference cost. This problem can be\naddressed by quantization, which consists in converting floating point\noperations into a lower bit-width format. With the growing concerns on privacy\nrights, we focus our efforts on data-free methods. However, such techniques\nsuffer from their lack of adaptability to the target devices, as hardware\ntypically only supports specific bit widths. Thus, to adapt to a variety of\ndevices, a quantization method shall be flexible enough to find good accuracy\nvs. speed trade-offs for every bit width and target device. To achieve this,\nwe propose PIPE, a quantization method that leverages residual error expansion,\nalong with group sparsity and an ensemble approximation for better\nparallelization. 
PIPE is backed off by strong theoretical guarantees and\nachieves superior performance on every benchmarked application (from vision to\nNLP tasks), architecture (ConvNets, transformers) and bit-width (from int8 to\nternary quantization).\n","authors":["Edouard Yvinec","Arnaud Dapogny","Kevin Bailly"],"pdf_url":"https://arxiv.org/pdf/2311.15806v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2203.14645"},{"id":"http://arxiv.org/abs/2311.15803v1","updated":"2023-11-27T13:25:47Z","published":"2023-11-27T13:25:47Z","title":"SOAC: Spatio-Temporal Overlap-Aware Multi-Sensor Calibration using\n Neural Radiance Fields","summary":" In rapidly-evolving domains such as autonomous driving, the use of multiple\nsensors with different modalities is crucial to ensure high operational\nprecision and stability. To correctly exploit the provided information by each\nsensor in a single common frame, it is essential for these sensors to be\naccurately calibrated. In this paper, we leverage the ability of Neural\nRadiance Fields (NeRF) to represent different sensors modalities in a common\nvolumetric representation to achieve robust and accurate spatio-temporal sensor\ncalibration. By designing a partitioning approach based on the visible part of\nthe scene for each sensor, we formulate the calibration problem using only the\noverlapping areas. This strategy results in a more robust and accurate\ncalibration that is less prone to failure. We demonstrate that our approach\nworks on outdoor urban scenes by validating it on multiple established driving\ndatasets. Results show that our method is able to get better accuracy and\nrobustness compared to existing methods.\n","authors":["Quentin Herau","Nathan Piasco","Moussab Bennehar","Luis Roldão","Dzmitry Tsishkou","Cyrille Migniot","Pascal Vasseur","Cédric Demonceaux"],"pdf_url":"https://arxiv.org/pdf/2311.15803v1.pdf","comment":"Paper + Supplementary, under review"},{"id":"http://arxiv.org/abs/2209.07042v5","updated":"2023-11-27T13:18:28Z","published":"2022-09-15T04:51:17Z","title":"Efficient Perception, Planning, and Control Algorithms for Vision-Based\n Automated Vehicles","summary":" Autonomous vehicles have limited computational resources; hence, their\ncontrol systems must be efficient. The cost and size of sensors have limited\nthe development of self-driving cars. To overcome these restrictions, this\nstudy proposes an efficient framework for the operation of vision-based\nautomatic vehicles; the framework requires only a monocular camera and a few\ninexpensive radars. The proposed algorithm comprises a multi-task UNet (MTUNet)\nnetwork for extracting image features and constrained iterative linear\nquadratic regulator (CILQR) and vision predictive control (VPC) modules for\nrapid motion planning and control. MTUNet is designed to simultaneously solve\nlane line segmentation, the ego vehicle's heading angle regression, road type\nclassification, and traffic object detection tasks at approximately 40 FPS\n(frames per second) for 228 x 228 pixel RGB input images. The CILQR controllers\nthen use the MTUNet outputs and radar data as inputs to produce driving\ncommands for lateral and longitudinal vehicle guidance within only 1 ms. In\nparticular, the VPC algorithm is included to reduce steering command latency to\nbelow actuator latency to prevent self-driving vehicle performance degradation\nduring tight turns. 
The VPC algorithm uses road curvature data from MTUNet to\nestimate the correction of the current steering angle at a look-ahead point to\nadjust the turning amount. Including the VPC algorithm in a VPC-CILQR\ncontroller on curvy roads leads to higher performance than CILQR alone. Our\nexperiments demonstrate that the proposed autonomous driving system, which does\nnot require high-definition maps, could be applied in current autonomous\nvehicles.\n","authors":["Der-Hau Lee"],"pdf_url":"https://arxiv.org/pdf/2209.07042v5.pdf","comment":"10 figures, 13 pages"},{"id":"http://arxiv.org/abs/2310.03335v2","updated":"2023-11-27T13:18:11Z","published":"2023-10-05T06:35:21Z","title":"Continual Test-time Domain Adaptation via Dynamic Sample Selection","summary":" The objective of Continual Test-time Domain Adaptation (CTDA) is to gradually\nadapt a pre-trained model to a sequence of target domains without accessing the\nsource data. This paper proposes a Dynamic Sample Selection (DSS) method for\nCTDA. DSS consists of dynamic thresholding, positive learning, and negative\nlearning processes. Traditionally, models learn from unlabeled unknown\nenvironment data and equally rely on all samples' pseudo-labels to update their\nparameters through self-training. However, noisy predictions exist in these\npseudo-labels, so all samples are not equally trustworthy. Therefore, in our\nmethod, a dynamic thresholding module is first designed to select suspected\nlow-quality from high-quality samples. The selected low-quality samples are\nmore likely to be wrongly predicted. Therefore, we apply joint positive and\nnegative learning on both high- and low-quality samples to reduce the risk of\nusing wrong information. We conduct extensive experiments that demonstrate the\neffectiveness of our proposed method for CTDA in the image domain,\noutperforming the state-of-the-art results. Furthermore, our approach is also\nevaluated in the 3D point cloud domain, showcasing its versatility and\npotential for broader applicability.\n","authors":["Yanshuo Wang","Jie Hong","Ali Cheraghian","Shafin Rahman","David Ahmedt-Aristizabal","Lars Petersson","Mehrtash Harandi"],"pdf_url":"https://arxiv.org/pdf/2310.03335v2.pdf","comment":"2024 IEEE/CVF Winter Conference on Applications of Computer Vision"},{"id":"http://arxiv.org/abs/2304.02970v4","updated":"2023-11-27T13:11:20Z","published":"2023-04-06T09:54:06Z","title":"A Closer Look at Audio-Visual Segmentation","summary":" Audio-visual segmentation (AVS) is a complex task that involves accurately\nsegmenting the corresponding sounding object based on audio-visual queries.\nSuccessful audio-visual learning requires two essential components: 1) an\nunbiased dataset with high-quality pixel-level multi-class labels, and 2) a\nmodel capable of effectively linking audio information with its corresponding\nvisual object. However, these two requirements are only partially addressed by\ncurrent methods, with training sets containing biased audio-visual data, and\nmodels that generalise poorly beyond this biased training set. In this work, we\npropose a new strategy to build cost-effective and relatively unbiased\naudio-visual semantic segmentation benchmarks. Our strategy, called Visual\nPost-production (VPO), explores the observation that it is not necessary to\nhave explicit audio-visual pairs extracted from single video sources to build\nsuch benchmarks. 
We also refine the previously proposed AVSBench to transform\nit into the audio-visual semantic segmentation benchmark AVSBench-Single+.\nFurthermore, this paper introduces a new pixel-wise audio-visual contrastive\nlearning method to enable a better generalisation of the model beyond the\ntraining set. We verify the validity of the VPO strategy by showing that\nstate-of-the-art (SOTA) models trained with datasets built by matching audio\nand visual data from different sources or with datasets containing audio and\nvisual data from the same video source produce almost the same accuracy. Then,\nusing the proposed VPO benchmarks and AVSBench-Single+, we show that our method\nproduces more accurate audio-visual semantic segmentation than SOTA models.\nCode and dataset will be available.\n","authors":["Yuanhong Chen","Yuyuan Liu","Hu Wang","Fengbei Liu","Chong Wang","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2304.02970v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10594v2","updated":"2023-11-27T13:02:06Z","published":"2023-03-19T07:53:31Z","title":"AdaptGuard: Defending Against Universal Attacks for Model Adaptation","summary":" Model adaptation aims at solving the domain transfer problem under the\nconstraint of only accessing the pretrained source models. With the increasing\nconsiderations of data privacy and transmission efficiency, this paradigm has\nbeen gaining recent popularity. This paper studies the vulnerability to\nuniversal attacks transferred from the source domain during model adaptation\nalgorithms due to the existence of malicious providers. We explore both\nuniversal adversarial perturbations and backdoor attacks as loopholes on the\nsource side and discover that they still survive in the target models after\nadaptation. To address this issue, we propose a model preprocessing framework,\nnamed AdaptGuard, to improve the security of model adaptation algorithms.\nAdaptGuard avoids direct use of the risky source parameters through knowledge\ndistillation and utilizes the pseudo adversarial samples under adjusted radius\nto enhance the robustness. AdaptGuard is a plug-and-play module that requires\nneither robust pretrained models nor any changes for the following model\nadaptation algorithms. Extensive results on three commonly used datasets and\ntwo popular adaptation methods validate that AdaptGuard can effectively defend\nagainst universal attacks and maintain clean accuracy in the target domain\nsimultaneously. We hope this research will shed light on the safety and\nrobustness of transfer learning. Code is available at\nhttps://github.com/TomSheng21/AdaptGuard.\n","authors":["Lijun Sheng","Jian Liang","Ran He","Zilei Wang","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2303.10594v2.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2311.15782v1","updated":"2023-11-27T12:55:39Z","published":"2023-11-27T12:55:39Z","title":"Relationship between Model Compression and Adversarial Robustness: A\n Review of Current Evidence","summary":" Increasing the model capacity is a known approach to enhance the adversarial\nrobustness of deep learning networks. On the other hand, various model\ncompression techniques, including pruning and quantization, can reduce the size\nof the network while preserving its accuracy. Several recent studies have\naddressed the relationship between model compression and adversarial\nrobustness, while some experiments have reported contradictory results. 
This\nwork summarizes available evidence and discusses possible explanations for the\nobserved effects.\n","authors":["Svetlana Pavlitska","Hannes Grolig","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2311.15782v1.pdf","comment":"Accepted for publication at SSCI 2023"},{"id":"http://arxiv.org/abs/2311.15776v1","updated":"2023-11-27T12:51:42Z","published":"2023-11-27T12:51:42Z","title":"Stable Segment Anything Model","summary":" The Segment Anything Model (SAM) achieves remarkable promptable segmentation\ngiven high-quality prompts which, however, often require good skills to\nspecify. To make SAM robust to casual prompts, this paper presents the first\ncomprehensive analysis on SAM's segmentation stability across a diverse\nspectrum of prompt qualities, notably imprecise bounding boxes and insufficient\npoints. Our key finding reveals that given such low-quality prompts, SAM's mask\ndecoder tends to activate image features that are biased towards the background\nor confined to specific object parts. To mitigate this issue, our key idea\nconsists of adjusting the sampling locations of image feature using learnable\ndeformable offsets, while the original SAM model architecture and weights\nremain unchanged. Consequently, our deformable sampling plugin (DSP) enables\nSAM to adaptively shift attention to the prompted target regions in a\ndata-driven manner, facilitated by our effective robust training strategy\n(RTS). During inference, dynamic routing plugin (DRP) is proposed that toggles\nSAM between the deformable and regular grid sampling modes, conditioned on the\ninput prompt quality. Thus, our solution, termed Stable-SAM, is one of its kind\nfocusing on solely adjusting feature sampling locations, which offers several\nadvantages: 1) improved SAM's segmentation stability across a wide range of\nprompt qualities, while 2) retaining SAM's powerful promptable segmentation\nefficiency and generality, with 3) minimal learnable parameters (0.08 M) and\nfast adaptation (by 1 training epoch). Extensive experiments across multiple\ndatasets validate the effectiveness and advantages of our approach,\nunderscoring Stable-SAM as a more robust solution for segmenting anything.\nCodes will be released upon acceptance.\n","authors":["Qi Fan","Xin Tao","Lei Ke","Mingqiao Ye","Yuan Zhang","Pengfei Wan","Zhongyuan Wang","Yu-Wing Tai","Chi-Keung Tang"],"pdf_url":"https://arxiv.org/pdf/2311.15776v1.pdf","comment":"Codes will be released upon acceptance"},{"id":"http://arxiv.org/abs/2305.19599v3","updated":"2023-11-27T12:50:09Z","published":"2023-05-31T06:59:21Z","title":"RealignDiff: Boosting Text-to-Image Diffusion Model with Coarse-to-fine\n Semantic Re-alignment","summary":" Recent advances in text-to-image diffusion models have achieved remarkable\nsuccess in generating high-quality, realistic images from textual descriptions.\nHowever, these approaches have faced challenges in precisely aligning the\ngenerated visual content with the textual concepts described in the prompts. In\nthis paper, we propose a two-stage coarse-to-fine semantic re-alignment method,\nnamed RealignDiff, aimed at improving the alignment between text and images in\ntext-to-image diffusion models. In the coarse semantic re-alignment phase, a\nnovel caption reward, leveraging the BLIP-2 model, is proposed to evaluate the\nsemantic discrepancy between the generated image caption and the given text\nprompt. 
Subsequently, the fine semantic re-alignment stage employs a local\ndense caption generation module and a re-weighting attention modulation module\nto refine the previously generated images from a local semantic view.\nExperimental results on the MS-COCO benchmark demonstrate that the proposed\ntwo-stage coarse-to-fine semantic re-alignment method outperforms other\nbaseline re-alignment techniques by a substantial margin in both visual quality\nand semantic similarity with the input prompt.\n","authors":["Guian Fang","Zutao Jiang","Jianhua Han","Guansong Lu","Hang Xu","Shengcai Liao","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2305.19599v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15773v1","updated":"2023-11-27T12:48:33Z","published":"2023-11-27T12:48:33Z","title":"Check, Locate, Rectify: A Training-Free Layout Calibration System for\n Text-to-Image Generation","summary":" Diffusion models have recently achieved remarkable progress in generating\nrealistic images. However, challenges remain in accurately understanding and\nsynthesizing the layout requirements in the textual prompts. To align the\ngenerated image with layout instructions, we present a training-free layout\ncalibration system SimM that intervenes in the generative process on the fly\nduring inference time. Specifically, following a \"check-locate-rectify\"\npipeline, the system first analyses the prompt to generate the target layout\nand compares it with the intermediate outputs to automatically detect errors.\nThen, by moving the located activations and making intra- and inter-map\nadjustments, the rectification process can be performed with negligible\ncomputational overhead. To evaluate SimM over a range of layout requirements,\nwe present a benchmark SimMBench that compensates for the lack of superlative\nspatial relations in existing datasets. And both quantitative and qualitative\nresults demonstrate the effectiveness of the proposed SimM in calibrating the\nlayout inconsistencies.\n","authors":["Biao Gong","Siteng Huang","Yutong Feng","Shiwei Zhang","Yuyuan Li","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2311.15773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14456v4","updated":"2023-11-27T12:45:22Z","published":"2022-11-26T02:15:35Z","title":"TetraSphere: A Neural Descriptor for O(3)-Invariant Point Cloud Analysis","summary":" In many practical applications, 3D point cloud analysis requires rotation\ninvariance. In this paper, we present a learnable descriptor invariant under 3D\nrotations and reflections, i.e., the O(3) actions, utilizing the recently\nintroduced steerable 3D spherical neurons and vector neurons. Specifically, we\npropose an embedding of the 3D spherical neurons into 4D vector neurons, which\nleverages end-to-end training of the model. In our approach, we perform\nTetraTransform--an equivariant embedding of the 3D input into 4D, constructed\nfrom the steerable neurons--and extract deeper O(3)-equivariant features using\nvector neurons. This integration of the TetraTransform into the VN-DGCNN\nframework, termed TetraSphere, negligibly increases the number of parameters by\nless than 0.0002%. TetraSphere sets a new state-of-the-art performance\nclassifying randomly rotated real-world object scans of the challenging subsets\nof ScanObjectNN. Additionally, TetraSphere outperforms all equivariant methods\non randomly rotated synthetic data: classifying objects from ModelNet40 and\nsegmenting parts of the ShapeNet shapes. 
Thus, our results reveal the practical\nvalue of steerable 3D spherical neurons for learning in 3D Euclidean space.\n","authors":["Pavlo Melnyk","Andreas Robinson","Michael Felsberg","Mårten Wadenbäck"],"pdf_url":"https://arxiv.org/pdf/2211.14456v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15769v1","updated":"2023-11-27T12:39:42Z","published":"2023-11-27T12:39:42Z","title":"Side4Video: Spatial-Temporal Side Network for Memory-Efficient\n Image-to-Video Transfer Learning","summary":" Large pre-trained vision models achieve impressive success in computer\nvision. However, fully fine-tuning large models for downstream tasks,\nparticularly in video understanding, can be prohibitively computationally\nexpensive. Recent studies turn their focus towards efficient image-to-video\ntransfer learning. Nevertheless, existing efficient fine-tuning methods lack\nattention to training memory usage and exploration of transferring a larger\nmodel to the video domain. In this paper, we present a novel Spatial-Temporal\nSide Network for memory-efficient fine-tuning large image models to video\nunderstanding, named Side4Video. Specifically, we introduce a lightweight\nspatial-temporal side network attached to the frozen vision model, which avoids\nthe backpropagation through the heavy pre-trained model and utilizes\nmulti-level spatial features from the original image model. This extremely\nmemory-efficient architecture enables our method to reduce memory usage by 75%\ncompared with previous adapter-based methods. In this way, we can transfer a huge ViT-E\n(4.4B) for video understanding tasks which is 14x larger than ViT-L (304M). Our\napproach achieves remarkable performance on various video datasets across\nunimodal and cross-modal tasks (i.e., action recognition and text-video\nretrieval), especially in Something-Something V1&V2 (67.3% & 74.6%),\nKinetics-400 (88.6%), MSR-VTT (52.3%), MSVD (56.1%) and VATEX (68.8%). We\nrelease our code at https://github.com/HJYao00/Side4Video.\n","authors":["Huanjin Yao","Wenhao Wu","Zhiheng Li"],"pdf_url":"https://arxiv.org/pdf/2311.15769v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2311.15759v1","updated":"2023-11-27T12:29:20Z","published":"2023-11-27T12:29:20Z","title":"Towards Vision Enhancing LLMs: Empowering Multimodal Knowledge Storage\n and Sharing in LLMs","summary":" Recent advancements in multimodal large language models (MLLMs) have achieved\nsignificant multimodal generation capabilities, akin to GPT-4. These models\npredominantly map visual information into language representation space,\nleveraging the vast knowledge and powerful text generation abilities of LLMs to\nproduce multimodal instruction-following responses. We could term this method\nas LLMs for Vision because of its employing LLMs for visual-language\nunderstanding, yet observe that these MLLMs neglect the potential of harnessing\nvisual knowledge to enhance overall capabilities of LLMs, which could be\nregarded as Vision Enhancing LLMs. In this paper, we propose an approach called\nMKS2, aimed at enhancing LLMs through empowering Multimodal Knowledge Storage\nand Sharing in LLMs. Specifically, we introduce the Modular Visual Memory, a\ncomponent integrated into the internal blocks of LLMs, designed to store\nopen-world visual information efficiently. Additionally, we present a soft\nMixtures-of-Multimodal Experts architecture in LLMs to invoke multimodal\nknowledge collaboration during generation. 
Our comprehensive experiments\ndemonstrate that MKS2 substantially augments the reasoning capabilities of LLMs\nin contexts necessitating physical or commonsense knowledge. It also delivers\ncompetitive results on multimodal benchmarks.\n","authors":["Yunxin Li","Baotian Hu","Wei Wang","Xiaochun Cao","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15759v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2304.02833v2","updated":"2023-11-27T12:10:09Z","published":"2023-04-06T02:45:39Z","title":"DoUnseen: Tuning-Free Class-Adaptive Object Detection of Unseen Objects\n for Robotic Grasping","summary":" How can we segment varying numbers of objects where each specific object\nrepresents its own separate class? To make the problem even more realistic, how\ncan we add and delete classes on the fly without retraining or fine-tuning?\nThis is the case of robotic applications where no datasets of the objects exist\nor application that includes thousands of objects (E.g., in logistics) where it\nis impossible to train a single model to learn all of the objects. Most current\nresearch on object segmentation for robotic grasping focuses on class-level\nobject segmentation (E.g., box, cup, bottle), closed sets (specific objects of\na dataset; for example, YCB dataset), or deep learning-based template matching.\nIn this work, we are interested in open sets where the number of classes is\nunknown, varying, and without pre-knowledge about the objects' types. We\nconsider each specific object as its own separate class. Our goal is to develop\nan object detector that requires no fine-tuning and can add any object as a\nclass just by capturing a few images of the object. Our main idea is to break\nthe segmentation pipelines into two steps by combining unseen object\nsegmentation networks cascaded by class-adaptive classifiers. We evaluate our\nclass-adaptive object detector on unseen datasets and compare it to a trained\nMask R-CNN on those datasets. The results show that the performance varies from\npractical to unsuitable depending on the environment setup and the objects\nbeing handled. The code is available in our DoUnseen library repository.\n","authors":["Anas Gouda","Moritz Roidl"],"pdf_url":"https://arxiv.org/pdf/2304.02833v2.pdf","comment":"presented at RSS 2023 Workshop on Perception and Manipulation\n Challenges for Warehouse Automation"},{"id":"http://arxiv.org/abs/2311.15751v1","updated":"2023-11-27T12:08:46Z","published":"2023-11-27T12:08:46Z","title":"PyNanospacing: TEM image processing tool for strain analysis and\n visualization","summary":" The diverse spectrum of material characteristics including band gap,\nmechanical moduli, color, phonon and electronic density of states, along with\ncatalytic and surface properties are intricately intertwined with the atomic\nstructure and the corresponding interatomic bond-lengths. This interconnection\nextends to the manifestation of interplanar spacings within a crystalline\nlattice. Analysis of these interplanar spacings and the comprehension of any\ndeviations, whether it be lattice compression or expansion, commonly referred\nto as strain, hold paramount significance in unraveling various unknowns within\nthe field. Transmission Electron Microscopy (TEM) is widely used to capture\natomic-scale ordering, facilitating direct investigation of interplanar\nspacings. However, creating critical contour maps for visualizing and\ninterpreting lattice stresses in TEM images remains a challenging task. 
Here we\ndeveloped a Python code for TEM image processing that can handle a wide range\nof materials including nanoparticles, 2D materials, pure crystals and solid\nsolutions. This algorithm converts local differences in interplanar spacings\ninto contour maps allowing for a visual representation of lattice expansion and\ncompression. The tool is very generic and can significantly aid in analyzing\nmaterial properties using TEM images, allowing for a more in-depth exploration\nof the underlying science behind strain engineering via strain contour maps at\nthe atomic level.\n","authors":["Mehmet Ali Sarsil","Mubashir Mansoor","Mert Saracoglu","Servet Timur","Mustafa Urgen","Onur Ergen"],"pdf_url":"https://arxiv.org/pdf/2311.15751v1.pdf","comment":"Preprint, 13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2311.15744v1","updated":"2023-11-27T12:02:42Z","published":"2023-11-27T12:02:42Z","title":"One More Step: A Versatile Plug-and-Play Module for Rectifying Diffusion\n Schedule Flaws and Enhancing Low-Frequency Controls","summary":" It is well known that many open-released foundational diffusion models have\ndifficulty in generating images that substantially depart from average\nbrightness, despite such images being present in the training data. This is due\nto an inconsistency: while denoising starts from pure Gaussian noise during\ninference, the training noise schedule retains residual data even in the final\ntimestep distribution, due to difficulties in numerical conditioning in\nmainstream formulation, leading to unintended bias during inference. To\nmitigate this issue, certain $\\epsilon$-prediction models are combined with an\nad-hoc offset-noise methodology. In parallel, some contemporary models have\nadopted zero-terminal SNR noise schedules together with\n$\\mathbf{v}$-prediction, which necessitate major alterations to pre-trained\nmodels. However, such changes risk destabilizing a large multitude of\ncommunity-driven applications anchored on these pre-trained models. In light of\nthis, our investigation revisits the fundamental causes, leading to our\nproposal of an innovative and principled remedy, called One More Step (OMS). By\nintegrating a compact network and incorporating an additional simple yet\neffective step during inference, OMS elevates image fidelity and harmonizes the\ndichotomy between training and inference, while preserving original model\nparameters. Once trained, various pre-trained diffusion models with the same\nlatent domain can share the same OMS module.\n","authors":["Minghui Hu","Jianbin Zheng","Chuanxia Zheng","Chaoyue Wang","Dacheng Tao","Tat-Jen Cham"],"pdf_url":"https://arxiv.org/pdf/2311.15744v1.pdf","comment":"Project Page: https://jabir-zheng.github.io/OneMoreStep/, Demo Page:\n https://huggingface.co/spaces/h1t/oms_sdxl_lcm"},{"id":"http://arxiv.org/abs/2311.15741v1","updated":"2023-11-27T11:46:30Z","published":"2023-11-27T11:46:30Z","title":"Machine Learning-Based Jamun Leaf Disease Detection: A Comprehensive\n Review","summary":" Jamun leaf diseases pose a significant threat to agricultural productivity,\nnegatively impacting both yield and quality in the jamun industry. The advent\nof machine learning has opened up new avenues for tackling these diseases\neffectively. Early detection and diagnosis are essential for successful crop\nmanagement. 
While no automated systems have yet been developed specifically for\njamun leaf disease detection, various automated systems have been implemented\nfor similar types of disease detection using image processing techniques. This\npaper presents a comprehensive review of machine learning methodologies\nemployed for diagnosing plant leaf diseases through image classification, which\ncan be adapted for jamun leaf disease detection. It meticulously assesses the\nstrengths and limitations of various Vision Transformer models, including\nTransfer learning model and vision transformer (TLMViT), SLViT, SE-ViT,\nIterationViT, Tiny-LeViT, IEM-ViT, GreenViT, and PMViT. Additionally, the paper\nreviews models such as Dense Convolutional Network (DenseNet), Residual Neural\nNetwork (ResNet)-50V2, EfficientNet, Ensemble model, Convolutional Neural\nNetwork (CNN), and Locally Reversible Transformer. These machine-learning\nmodels have been evaluated on various datasets, demonstrating their real-world\napplicability. This review not only sheds light on current advancements in the\nfield but also provides valuable insights for future research directions in\nmachine learning-based jamun leaf disease detection and classification.\n","authors":["Auvick Chandra Bhowmik","Dr. Md. Taimur Ahad","Yousuf Rayhan Emon"],"pdf_url":"https://arxiv.org/pdf/2311.15741v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15740v1","updated":"2023-11-27T11:44:46Z","published":"2023-11-27T11:44:46Z","title":"Optimization of Image Processing Algorithms for Character Recognition in\n Cultural Typewritten Documents","summary":" Linked Data is used in various fields as a new way of structuring and\nconnecting data. Cultural heritage institutions have been using linked data to\nimprove archival descriptions and facilitate the discovery of information. Most\narchival records have digital representations of physical artifacts in the form\nof scanned images that are non-machine-readable. Optical Character Recognition\n(OCR) recognizes text in images and translates it into machine-encoded text.\nThis paper evaluates the impact of image processing methods and parameter\ntuning in OCR applied to typewritten cultural heritage documents. The approach\nuses a multi-objective problem formulation to minimize Levenshtein edit\ndistance and maximize the number of words correctly identified with a\nnon-dominated sorting genetic algorithm (NSGA-II) to tune the methods'\nparameters. Evaluation results show that parameterization by digital\nrepresentation typology benefits the performance of image pre-processing\nalgorithms in OCR. 
Furthermore, our findings suggest that employing image\npre-processing algorithms in OCR might be more suitable for typologies where\nthe text recognition task without pre-processing does not produce good results.\nIn particular, Adaptive Thresholding, Bilateral Filter, and Opening are the\nbest-performing algorithms for the theatre plays' covers, letters, and overall\ndataset, respectively, and should be applied before OCR to improve its\nperformance.\n","authors":["Mariana Dias","Carla Teixeira Lopes"],"pdf_url":"https://arxiv.org/pdf/2311.15740v1.pdf","comment":"25 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.03017v3","updated":"2023-11-27T11:38:39Z","published":"2023-07-06T14:31:01Z","title":"RealLiFe: Real-Time Light Field Reconstruction via Hierarchical Sparse\n Gradient Descent","summary":" With the rise of Extended Reality (XR) technology, there is a growing need\nfor real-time light field generation from sparse view inputs. Existing methods\ncan be classified into offline techniques, which can generate high-quality\nnovel views but at the cost of long inference/training time, and online\nmethods, which either lack generalizability or produce unsatisfactory results.\nHowever, we have observed that the intrinsic sparse manifold of Multi-plane\nImages (MPI) enables a significant acceleration of light field generation while\nmaintaining rendering quality. Based on this insight, we introduce EffLiFe, a\nnovel light field optimization method, which leverages the proposed\nHierarchical Sparse Gradient Descent (HSGD) to produce high-quality light\nfields from sparse view images in real time. Technically, the coarse MPI of a\nscene is first generated using a 3D CNN, and it is further sparsely optimized\nby focusing only on important MPI gradients in a few iterations. Nevertheless,\nrelying solely on optimization can lead to artifacts at occlusion boundaries.\nTherefore, we propose an occlusion-aware iterative refinement module that\nremoves visual artifacts in occluded regions by iteratively filtering the\ninput. Extensive experiments demonstrate that our method achieves comparable\nvisual quality while being 100x faster on average than state-of-the-art offline\nmethods and delivering better performance (about 2 dB higher in PSNR) compared\nto other online approaches.\n","authors":["Yijie Deng","Lei Han","Tianpeng Lin","Lin Li","Jinzhi Zhang","Lu Fang"],"pdf_url":"https://arxiv.org/pdf/2307.03017v3.pdf","comment":"Submitted to IEEE TPAMI"},{"id":"http://arxiv.org/abs/2311.15732v1","updated":"2023-11-27T11:29:10Z","published":"2023-11-27T11:29:10Z","title":"GPT4Vis: What Can GPT-4 Do for Zero-shot Visual Recognition?","summary":" This paper does not present a novel method. Instead, it delves into an\nessential, yet must-know baseline in light of the latest advancements in\nGenerative Artificial Intelligence (GenAI): the utilization of GPT-4 for visual\nunderstanding. Our study centers on the evaluation of GPT-4's linguistic and\nvisual capabilities in zero-shot visual recognition tasks. Specifically, we\nexplore the potential of its generated rich textual descriptions across various\ncategories to enhance recognition performance without any training.\nAdditionally, we evaluate its visual proficiency in directly recognizing\ndiverse visual content. To achieve this, we conduct an extensive series of\nexperiments, systematically quantifying the performance of GPT-4 across three\nmodalities: images, videos, and point clouds. 
This comprehensive evaluation\nencompasses a total of 16 widely recognized benchmark datasets, providing top-1\nand top-5 accuracy metrics. Our study reveals that leveraging GPT-4's advanced\nlinguistic knowledge to generate rich descriptions markedly improves zero-shot\nrecognition. In terms of visual proficiency, GPT-4V's average performance\nacross 16 datasets sits roughly between the capabilities of OpenAI-CLIP's ViT-L\nand EVA-CLIP's ViT-E. We hope that this research will contribute valuable data\npoints and experience for future studies. We release our code at\nhttps://github.com/whwu95/GPT4Vis.\n","authors":["Wenhao Wu","Huanjin Yao","Mengxi Zhang","Yuxin Song","Wanli Ouyang","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15732v1.pdf","comment":"Technical report. Work in progress"},{"id":"http://arxiv.org/abs/2311.15728v1","updated":"2023-11-27T11:26:41Z","published":"2023-11-27T11:26:41Z","title":"Adinkra Symbol Recognition using Classical Machine Learning and Deep\n Learning","summary":" Artificial intelligence (AI) has emerged as a transformative influence,\nengendering paradigm shifts in global societies, spanning academia and\nindustry. However, in light of these rapid advances, addressing the\nunderrepresentation of black communities and African countries in AI is\ncrucial. Boosting enthusiasm for AI can be effectively accomplished by\nshowcasing straightforward applications around tasks like identifying and\ncategorizing traditional symbols, such as Adinkra symbols, or familiar objects\nwithin the community. In this research endeavor, we dived into classical\nmachine learning and harnessed the power of deep learning models to tackle the\nintricate task of classifying and recognizing Adinkra symbols. The idea led to\na newly constructed ADINKRA dataset comprising 174,338 images meticulously\norganized into 62 distinct classes, each representing a singular and emblematic\nsymbol. We constructed a CNN model for classification and recognition using six\nconvolutional layers, three fully connected (FC) layers, and optional dropout\nregularization. The model is a simpler and smaller version of VGG, with fewer\nlayers, smaller channel sizes, and a fixed kernel size. Additionally, we tap\ninto the transfer learning capabilities provided by pre-trained models like VGG\nand ResNet. These models assist us in both classifying images and extracting\nfeatures that can be used with classical machine learning models. We assess the\nmodel's performance by measuring its accuracy and convergence rate and\nvisualizing the areas that significantly influence its predictions. These\nevaluations serve as a foundational benchmark for future assessments of the\nADINKRA dataset. We hope this application exemplar inspires ideas on the\nvarious uses of AI in organizing our traditional and modern lives.\n","authors":["Michael Adjeisah","Kwame Omono Asamoah","Martha Asamoah Yeboah","Raji Rafiu King","Godwin Ferguson Achaab","Kingsley Adjei"],"pdf_url":"https://arxiv.org/pdf/2311.15728v1.pdf","comment":"15 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2311.15727v1","updated":"2023-11-27T11:24:25Z","published":"2023-11-27T11:24:25Z","title":"MARIS: Referring Image Segmentation via Mutual-Aware Attention Features","summary":" Referring image segmentation (RIS) aims to segment a particular region based\non a language expression prompt. Existing methods incorporate linguistic\nfeatures into visual features and obtain multi-modal features for mask\ndecoding. 
However, these methods may segment the visually salient entity\ninstead of the correct referring region, as the multi-modal features are\ndominated by the abundant visual context. In this paper, we propose MARIS, a\nreferring image segmentation method that leverages the Segment Anything Model\n(SAM) and introduces a mutual-aware attention mechanism to enhance the\ncross-modal fusion via two parallel branches. Specifically, our mutual-aware\nattention mechanism consists of Vision-Guided Attention and Language-Guided\nAttention, which bidirectionally model the relationship between visual and\nlinguistic features. Correspondingly, we design a Mask Decoder to enable\nexplicit linguistic guidance for more consistent segmentation with the language\nexpression. To this end, a multi-modal query token is proposed to integrate\nlinguistic information and interact with visual information simultaneously.\nExtensive experiments on three benchmark datasets show that our method\noutperforms the state-of-the-art RIS methods. Our code will be publicly\navailable.\n","authors":["Mengxi Zhang","Yiming Liu","Xiangjun Yin","Huanjing Yue","Jingyu Yang"],"pdf_url":"https://arxiv.org/pdf/2311.15727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15722v1","updated":"2023-11-27T11:17:20Z","published":"2023-11-27T11:17:20Z","title":"GLIME: General, Stable and Local LIME Explanation","summary":" As black-box machine learning models grow in complexity and find applications\nin high-stakes scenarios, it is imperative to provide explanations for their\npredictions. Although Local Interpretable Model-agnostic Explanations (LIME)\n[22] is a widely adopted method for understanding model behaviors, it is\nunstable with respect to random seeds [35,24,3] and exhibits low local fidelity\n(i.e., how well the explanation approximates the model's local behaviors)\n[21,16]. Our study shows that this instability problem stems from small sample\nweights, leading to the dominance of regularization and slow convergence.\nAdditionally, LIME's sampling neighborhood is non-local and biased towards the\nreference, resulting in poor local fidelity and sensitivity to reference\nchoice. To tackle these challenges, we introduce GLIME, an enhanced framework\nextending LIME and unifying several prior methods. Within the GLIME framework,\nwe derive an equivalent formulation of LIME that achieves significantly faster\nconvergence and improved stability. By employing a local and unbiased sampling\ndistribution, GLIME generates explanations with higher local fidelity compared\nto LIME. GLIME explanations are independent of reference choice. Moreover,\nGLIME offers users the flexibility to choose a sampling distribution based on\ntheir specific scenarios.\n","authors":["Zeren Tan","Yang Tian","Jian Li"],"pdf_url":"https://arxiv.org/pdf/2311.15722v1.pdf","comment":"Accepted by NeurIPS 2023 as a Spotlight paper"},{"id":"http://arxiv.org/abs/2311.15719v1","updated":"2023-11-27T11:12:33Z","published":"2023-11-27T11:12:33Z","title":"Variational Autoencoders for Feature Exploration and Malignancy\n Prediction of Lung Lesions","summary":" Lung cancer is responsible for 21% of cancer deaths in the UK and five-year\nsurvival rates are heavily influenced by the stage the cancer was identified\nat. Recent studies have demonstrated the capability of AI methods for accurate\nand early diagnosis of lung cancer from routine scans. However, this evidence\nhas not translated into clinical practice with one barrier being a lack of\ninterpretable models. 
This study investigates the application of Variational\nAutoencoders (VAEs), a type of generative AI model, to lung cancer lesions.\nProposed models were trained on lesions extracted from 3D CT scans in the\nLIDC-IDRI public dataset. Latent vector representations of 2D slices produced\nby the VAEs were explored through clustering to justify their quality and used\nin an MLP classifier model for lung cancer diagnosis; the best model achieved\nstate-of-the-art metrics of AUC 0.98 and 93.1% accuracy. Cluster analysis shows\nthe VAE latent space separates the dataset of malignant and benign lesions\nbased on meaningful feature components including tumour size, shape, patient\nand malignancy class. We also include a comparative analysis of the standard\nGaussian VAE (GVAE) and the more recent Dirichlet VAE (DirVAE), which replaces\nthe prior with a Dirichlet distribution to encourage a more explainable latent\nspace with disentangled feature representation. Finally, we demonstrate the\npotential for latent space traversals corresponding to clinically meaningful\nfeature changes.\n","authors":["Benjamin Keel","Aaron Quyn","David Jayne","Samuel D. Relton"],"pdf_url":"https://arxiv.org/pdf/2311.15719v1.pdf","comment":"10 pages (main paper), 5 pages (references), 5 figures, 2 tables,\n work accepted for BMVC 2023"},{"id":"http://arxiv.org/abs/2309.12303v3","updated":"2023-11-27T11:04:07Z","published":"2023-09-21T17:59:02Z","title":"PanoVOS: Bridging Non-panoramic and Panoramic Views with Transformer for\n Video Segmentation","summary":" Panoramic videos contain richer spatial information and have attracted\ntremendous amounts of attention due to their exceptional experience in some\nfields such as autonomous driving and virtual reality. However, existing\ndatasets for video segmentation only focus on conventional planar images. To\naddress the challenge, in this paper, we present a panoramic video dataset,\nPanoVOS. The dataset provides 150 videos with high video resolutions and\ndiverse motions. To quantify the domain gap between 2D planar videos and\npanoramic videos, we evaluate 15 off-the-shelf video object segmentation (VOS)\nmodels on PanoVOS. Through error analysis, we found that all of them fail to\ntackle pixel-level content discontinuities of panoramic videos. Thus, we present a\nPanoramic Space Consistency Transformer (PSCFormer), which can effectively\nutilize the semantic boundary information of the previous frame for pixel-level\nmatching with the current frame. Extensive experiments demonstrate that\ncompared with the previous SOTA models, our PSCFormer network exhibits a great\nadvantage in terms of segmentation results under the panoramic setting. Our\ndataset poses new challenges in panoramic VOS and we hope that our PanoVOS can\nadvance the development of panoramic segmentation/tracking.\n","authors":["Shilin Yan","Xiaohao Xu","Renrui Zhang","Lingyi Hong","Wenchao Chen","Wenqiang Zhang","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.12303v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15707v1","updated":"2023-11-27T10:50:47Z","published":"2023-11-27T10:50:47Z","title":"SAM-6D: Segment Anything Model Meets Zero-Shot 6D Object Pose Estimation","summary":" Zero-shot 6D object pose estimation involves the detection of novel objects\nwith their 6D poses in cluttered scenes, presenting significant challenges for\nmodel generalizability. 
Fortunately, the recent Segment Anything Model (SAM)\nhas showcased remarkable zero-shot transfer performance, which provides a\npromising solution to tackle this task. Motivated by this, we introduce SAM-6D,\na novel framework designed to realize the task through two steps, including\ninstance segmentation and pose estimation. Given the target objects, SAM-6D\nemploys two dedicated sub-networks, namely Instance Segmentation Model (ISM)\nand Pose Estimation Model (PEM), to perform these steps on cluttered RGB-D\nimages. ISM takes SAM as an advanced starting point to generate all possible\nobject proposals and selectively preserves valid ones through meticulously\ncrafted object matching scores in terms of semantics, appearance and geometry.\nBy treating pose estimation as a partial-to-partial point matching problem, PEM\nperforms a two-stage point matching process featuring a novel design of\nbackground tokens to construct dense 3D-3D correspondence, ultimately yielding\nthe pose estimates. Without bells and whistles, SAM-6D outperforms the existing\nmethods on the seven core datasets of the BOP Benchmark for both instance\nsegmentation and pose estimation of novel objects.\n","authors":["Jiehong Lin","Lihua Liu","Dekun Lu","Kui Jia"],"pdf_url":"https://arxiv.org/pdf/2311.15707v1.pdf","comment":"Github Page: https://github.com/JiehongLin/SAM-6D"},{"id":"http://arxiv.org/abs/2309.17389v4","updated":"2023-11-27T10:46:03Z","published":"2023-09-29T16:50:38Z","title":"Prompt-based test-time real image dehazing: a novel pipeline","summary":" Existing methods attempt to improve models' generalization ability on\nreal-world hazy images by exploring well-designed training schemes (e.g.,\nCycleGAN, prior loss). However, most of them need very complicated training\nprocedures to achieve satisfactory results. In this work, we present a totally\nnovel testing pipeline called Prompt-based Test-Time Dehazing (PTTD) to help\ngenerate visually pleasing results of real-captured hazy images during the\ninference phase. We experimentally find that given a dehazing model trained on\nsynthetic data, by fine-tuning the statistics (i.e., mean and standard\ndeviation) of encoding features, PTTD is able to narrow the domain gap,\nboosting the performance of real image dehazing. Accordingly, we first apply a\nprompt generation module (PGM) to generate a visual prompt, which is the source\nof appropriate statistical perturbations for mean and standard deviation. And\nthen, we employ the feature adaptation module (FAM) into the existing dehazing\nmodels for adjusting the original statistics with the guidance of the generated\nprompt. Note that, PTTD is model-agnostic and can be equipped with various\nstate-of-the-art dehazing models trained on synthetic hazy-clean pairs.\nExtensive experimental results demonstrate that our PTTD is flexible meanwhile\nachieves superior performance against state-of-the-art dehazing methods in\nreal-world scenarios. 
The source code of our PTTD will be made available at\nhttps://github.com/cecret3350/PTTD-Dehazing.\n","authors":["Zixuan Chen","Zewei He","Ziqian Lu","Xuecheng Sun","Zhe-Ming Lu"],"pdf_url":"https://arxiv.org/pdf/2309.17389v4.pdf","comment":"update github link (https://github.com/cecret3350/PTTD-Dehazing)"},{"id":"http://arxiv.org/abs/2311.13372v2","updated":"2023-11-27T10:42:46Z","published":"2023-11-22T13:13:19Z","title":"MRGazer: Decoding Eye Gaze Points from Functional Magnetic Resonance\n Imaging in Individual Space","summary":" Eye-tracking research has proven valuable in understanding numerous cognitive\nfunctions. Recently, Frey et al. provided an exciting deep learning method for\nlearning eye movements from fMRI data. However, it needed to co-register fMRI\ninto standard space to obtain eyeballs masks, and thus required additional\ntemplates and was time consuming. To resolve this issue, in this paper, we\npropose a framework named MRGazer for predicting eye gaze points from fMRI in\nindividual space. The MRGazer consisted of eyeballs extraction module and a\nresidual network-based eye gaze prediction. Compared to the previous method,\nthe proposed framework skips the fMRI co-registration step, simplifies the\nprocessing protocol and achieves end-to-end eye gaze regression. The proposed\nmethod achieved superior performance in a variety of eye movement tasks than\nthe co-registration-based method, and delivered objective results within a\nshorter time (~ 0.02 Seconds for each volume) than prior method (~0.3 Seconds\nfor each volume).\n","authors":["Xiuwen Wu","Rongjie Hu","Jie Liang","Yanming Wang","Bensheng Qiu","Xiaoxiao Wang"],"pdf_url":"https://arxiv.org/pdf/2311.13372v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01825v2","updated":"2023-11-27T10:39:13Z","published":"2023-10-03T06:42:28Z","title":"Empirical Study of PEFT techniques for Winter Wheat Segmentation","summary":" Parameter Efficient Fine Tuning (PEFT) techniques have recently experienced\nsignificant growth and have been extensively employed to adapt large vision and\nlanguage models to various domains, enabling satisfactory model performance\nwith minimal computational needs. Despite these advances, more research has yet\nto delve into potential PEFT applications in real-life scenarios, particularly\nin the critical domains of remote sensing and crop monitoring. The diversity of\nclimates across different regions and the need for comprehensive large-scale\ndatasets have posed significant obstacles to accurately identify crop types\nacross varying geographic locations and changing growing seasons. This study\nseeks to bridge this gap by comprehensively exploring the feasibility of\ncross-area and cross-year out-of-distribution generalization using the\nState-of-the-Art (SOTA) wheat crop monitoring model. The aim of this work is to\nexplore PEFT approaches for crop monitoring. Specifically, we focus on adapting\nthe SOTA TSViT model to address winter wheat field segmentation, a critical\ntask for crop monitoring and food security. This adaptation process involves\nintegrating different PEFT techniques, including BigFit, LoRA, Adaptformer, and\nprompt tuning. Using PEFT techniques, we achieved notable results comparable to\nthose achieved using full fine-tuning methods while training only a mere 0.7%\nparameters of the whole TSViT architecture. 
The in-house labeled data-set,\nreferred to as the Beqaa-Lebanon dataset, comprises high-quality annotated\npolygons for wheat and non-wheat classes with a total surface of 170 kmsq, over\nfive consecutive years. Using Sentinel-2 images, our model achieved a 84%\nF1-score. We intend to publicly release the Lebanese winter wheat data set,\ncode repository, and model weights.\n","authors":["Mohamad Hasan Zahweh","Hasan Nasrallah","Mustafa Shukor","Ghaleb Faour","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06226v2","updated":"2023-11-27T10:31:09Z","published":"2023-03-10T22:21:30Z","title":"NeRFlame: FLAME-based conditioning of NeRF for 3D face rendering","summary":" Traditional 3D face models are based on mesh representations with texture.\nOne of the most important models is FLAME (Faces Learned with an Articulated\nModel and Expressions), which produces meshes of human faces that are fully\ncontrollable. Unfortunately, such models have problems with capturing geometric\nand appearance details. In contrast to mesh representation, the neural radiance\nfield (NeRF) produces extremely sharp renders. However, implicit methods are\nhard to animate and do not generalize well to unseen expressions. It is not\ntrivial to effectively control NeRF models to obtain face manipulation.\n The present paper proposes a novel approach, named NeRFlame, which combines\nthe strengths of both NeRF and FLAME methods. Our method enables high-quality\nrendering capabilities of NeRF while also offering complete control over the\nvisual appearance, similar to FLAME. In contrast to traditional NeRF-based\nstructures that use neural networks for RGB color and volume density modeling,\nour approach utilizes the FLAME mesh as a distinct density volume.\nConsequently, color values exist only in the vicinity of the FLAME mesh. This\nFLAME framework is seamlessly incorporated into the NeRF architecture for\npredicting RGB colors, enabling our model to explicitly represent volume\ndensity and implicitly capture RGB colors.\n","authors":["Wojciech Zając","Joanna Waczyńska","Piotr Borycki","Jacek Tabor","Maciej Zięba","Przemysław Spurek"],"pdf_url":"https://arxiv.org/pdf/2303.06226v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15679v1","updated":"2023-11-27T10:10:25Z","published":"2023-11-27T10:10:25Z","title":"Model-agnostic Body Part Relevance Assessment for Pedestrian Detection","summary":" Model-agnostic explanation methods for deep learning models are flexible\nregarding usability and availability. However, due to the fact that they can\nonly manipulate input to see changes in output, they suffer from weak\nperformance when used with complex model architectures. For models with large\ninputs as, for instance, in object detection, sampling-based methods like\nKernelSHAP are inefficient due to many computation-heavy forward passes through\nthe model. In this work, we present a framework for using sampling-based\nexplanation models in a computer vision context by body part relevance\nassessment for pedestrian detection. 
Furthermore, we introduce a novel\nsampling-based method similar to KernelSHAP that shows more robustness for\nlower sampling sizes and, thus, is more efficient for explainability analyses\non large-scale datasets.\n","authors":["Maurice Günder","Sneha Banerjee","Rafet Sifa","Christian Bauckhage"],"pdf_url":"https://arxiv.org/pdf/2311.15679v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15672v1","updated":"2023-11-27T10:01:31Z","published":"2023-11-27T10:01:31Z","title":"HAVE-FUN: Human Avatar Reconstruction from Few-Shot Unconstrained Images","summary":" As for human avatar reconstruction, contemporary techniques commonly\nnecessitate the acquisition of costly data and struggle to achieve satisfactory\nresults from a small number of casual images. In this paper, we investigate\nthis task from a few-shot unconstrained photo album. The reconstruction of\nhuman avatars from such data sources is challenging because of limited data\namount and dynamic articulated poses. For handling dynamic data, we integrate a\nskinning mechanism with deep marching tetrahedra (DMTet) to form a drivable\ntetrahedral representation, which drives arbitrary mesh topologies generated by\nthe DMTet for the adaptation of unconstrained images. To effectively mine\ninstructive information from few-shot data, we devise a two-phase optimization\nmethod with few-shot reference and few-shot guidance. The former focuses on\naligning avatar identity with reference images, while the latter aims to\ngenerate plausible appearances for unseen regions. Overall, our framework,\ncalled HaveFun, can undertake avatar reconstruction, rendering, and animation.\nExtensive experiments on our developed benchmarks demonstrate that HaveFun\nexhibits substantially superior performance in reconstructing the human body\nand hand. Project website: https://seanchenxy.github.io/HaveFunWeb/.\n","authors":["Xihe Yang","Xingyu Chen","Shaohui Wang","Daiheng Gao","Xiaoguang Han","Baoyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15668v1","updated":"2023-11-27T09:55:55Z","published":"2023-11-27T09:55:55Z","title":"Deformation-Guided Unsupervised Non-Rigid Shape Matching","summary":" We present an unsupervised data-driven approach for non-rigid shape matching.\nShape matching identifies correspondences between two shapes and is a\nfundamental step in many computer vision and graphics applications. Our\napproach is designed to be particularly robust when matching shapes digitized\nusing 3D scanners that contain fine geometric detail and suffer from different\ntypes of noise including topological noise caused by the coalescence of\nspatially close surface regions. We build on two strategies. First, using a\nhierarchical patch based shape representation we match shapes consistently in a\ncoarse to fine manner, allowing for robustness to noise. This multi-scale\nrepresentation drastically reduces the dimensionality of the problem when\nmatching at the coarsest scale, rendering unsupervised learning feasible.\nSecond, we constrain this hierarchical matching to be reflected in 3D by\nfitting a patch-wise near-rigid deformation model. Using this constraint, we\nleverage spatial continuity at different scales to capture global shape\nproperties, resulting in matchings that generalize well to data with different\ndeformations and noise characteristics. 
Experiments demonstrate that our\napproach obtains significantly better results on raw 3D scans than\nstate-of-the-art methods, while performing on-par on standard test scenarios.\n","authors":["Aymen Merrouche","Joao Regateiro","Stefanie Wuhrer","Edmond Boyer"],"pdf_url":"https://arxiv.org/pdf/2311.15668v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13959v2","updated":"2023-11-27T09:47:10Z","published":"2023-11-23T12:17:45Z","title":"RankFeat&RankWeight: Rank-1 Feature/Weight Removal for\n Out-of-distribution Detection","summary":" The task of out-of-distribution (OOD) detection is crucial for deploying\nmachine learning models in real-world settings. In this paper, we observe that\nthe singular value distributions of the in-distribution (ID) and OOD features\nare quite different: the OOD feature matrix tends to have a larger dominant\nsingular value than the ID feature, and the class predictions of OOD samples\nare largely determined by it. This observation motivates us to propose\n\\texttt{RankFeat}, a simple yet effective \\emph{post hoc} approach for OOD\ndetection by removing the rank-1 matrix composed of the largest singular value\nand the associated singular vectors from the high-level feature.\n\\texttt{RankFeat} achieves \\emph{state-of-the-art} performance and reduces the\naverage false positive rate (FPR95) by 17.90\\% compared with the previous best\nmethod. The success of \\texttt{RankFeat} motivates us to investigate whether a\nsimilar phenomenon would exist in the parameter matrices of neural networks. We\nthus propose \\texttt{RankWeight} which removes the rank-1 weight from the\nparameter matrices of a single deep layer. Our \\texttt{RankWeight}is also\n\\emph{post hoc} and only requires computing the rank-1 matrix once. As a\nstandalone approach, \\texttt{RankWeight} has very competitive performance\nagainst other methods across various backbones. Moreover, \\texttt{RankWeight}\nenjoys flexible compatibility with a wide range of OOD detection methods. The\ncombination of \\texttt{RankWeight} and \\texttt{RankFeat} refreshes the new\n\\emph{state-of-the-art} performance, achieving the FPR95 as low as 16.13\\% on\nthe ImageNet-1k benchmark. Extensive ablation studies and comprehensive\ntheoretical analyses are presented to support the empirical results.\n","authors":["Yue Song","Nicu Sebe","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2311.13959v2.pdf","comment":"submitted to T-PAMI. arXiv admin note: substantial text overlap with\n arXiv:2209.08590"},{"id":"http://arxiv.org/abs/2311.15660v1","updated":"2023-11-27T09:40:53Z","published":"2023-11-27T09:40:53Z","title":"Technical Report for Argoverse Challenges on 4D Occupancy Forecasting","summary":" This report presents our Le3DE2E_Occ solution for 4D Occupancy Forecasting in\nArgoverse Challenges at CVPR 2023 Workshop on Autonomous Driving (WAD). Our\nsolution consists of a strong LiDAR-based Bird's Eye View (BEV) encoder with\ntemporal fusion and a two-stage decoder, which combines a DETR head and a UNet\ndecoder. The solution was tested on the Argoverse 2 sensor dataset to evaluate\nthe occupancy state 3 seconds in the future. 
Our solution achieved 18% lower L1\nError (3.57) than the baseline and got the 1st place on the 4D Occupancy\nForecasting task in Argoverse Challenges at CVPR 2023.\n","authors":["Pengfei Zheng","Kanokphan Lertniphonphan","Feng Chen","Siwei Chen","Bingchuan Sun","Jun Xie","Zhepeng Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15660v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15658v1","updated":"2023-11-27T09:40:14Z","published":"2023-11-27T09:40:14Z","title":"Regularization by Texts for Latent Diffusion Inverse Solvers","summary":" The recent advent of diffusion models has led to significant progress in\nsolving inverse problems, leveraging these models as effective generative\npriors. Nonetheless, challenges related to the ill-posed nature of such\nproblems remain, often due to inherent ambiguities in measurements. Drawing\ninspiration from the human ability to resolve visual ambiguities through\nperceptual biases, here we introduce a novel latent diffusion inverse solver by\nincorporating regularization by texts (TReg). Specifically, TReg applies the\ntextual description of the preconception of the solution during the reverse\nsampling phase, and this description is dynamically reinforced through\nnull-text optimization for adaptive negation. Our comprehensive experimental\nresults demonstrate that TReg successfully mitigates ambiguity in latent\ndiffusion inverse solvers, enhancing their effectiveness and accuracy.\n","authors":["Jeongsol Kim","Geon Yeong Park","Hyungjin Chung","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2311.15658v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15657v1","updated":"2023-11-27T09:39:45Z","published":"2023-11-27T09:39:45Z","title":"Enhancing Diffusion Models with Text-Encoder Reinforcement Learning","summary":" Text-to-image diffusion models are typically trained to optimize the\nlog-likelihood objective, which presents challenges in meeting specific\nrequirements for downstream tasks, such as image aesthetics and image-text\nalignment. Recent research addresses this issue by refining the diffusion U-Net\nusing human rewards through reinforcement learning or direct backpropagation.\nHowever, many of them overlook the importance of the text encoder, which is\ntypically pretrained and fixed during training. In this paper, we demonstrate\nthat by finetuning the text encoder through reinforcement learning, we can\nenhance the text-image alignment of the results, thereby improving the visual\nquality. Our primary motivation comes from the observation that the current\ntext encoder is suboptimal, often requiring careful prompt adjustment. While\nfine-tuning the U-Net can partially improve performance, it still suffers\nfrom the suboptimal text encoder. Therefore, we propose to use reinforcement\nlearning with low-rank adaptation to finetune the text encoder based on\ntask-specific rewards, referred to as \textbf{TexForce}. 
We first show that\nfinetuning the text encoder can improve the performance of diffusion models.\nThen, we illustrate that TexForce can be simply combined with existing U-Net\nfinetuned models to get much better results without additional training.\nFinally, we showcase the adaptability of our method in diverse applications,\nincluding the generation of high-quality face and hand images.\n","authors":["Chaofeng Chen","Annan Wang","Haoning Wu","Liang Liao","Wenxiu Sun","Qiong Yan","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2311.15657v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15648v1","updated":"2023-11-27T09:20:12Z","published":"2023-11-27T09:20:12Z","title":"Reinforcement Learning from Diffusion Feedback: Q* for Image Search","summary":" Large vision-language models are steadily gaining personalization\ncapabilities at the cost of fine-tuning or data augmentation. We present two\nmodels for image generation using model-agnostic learning that align semantic\npriors with generative capabilities. RLDF, or Reinforcement Learning from\nDiffusion Feedback, is a singular approach for visual imitation through\nprior-preserving reward function guidance. This employs Q-learning (with\nstandard Q*) for generation and follows a semantic-rewarded trajectory for\nimage search through finite encoding-tailored actions. The second proposed\nmethod, noisy diffusion gradient, is optimization driven. At the root of both\nmethods is a special CFG encoding that we propose for continual semantic\nguidance. Using only a single input image and no text input, RLDF generates\nhigh-quality images over varied domains including retail, sports and\nagriculture showcasing class-consistency and strong visual diversity. Project\nwebsite is available at https://infernolia.github.io/RLDF.\n","authors":["Aboli Marathe"],"pdf_url":"https://arxiv.org/pdf/2311.15648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14969v2","updated":"2023-11-27T09:06:55Z","published":"2023-08-29T01:47:49Z","title":"Uncovering the Hidden Cost of Model Compression","summary":" In the era of resource-intensive foundation models, efficient adaptation in\ndownstream tasks has become paramount. Visual Prompting (VP), inspired by\nprompting in Large Language Models (LLMs), has emerged as a key transfer\nlearning method in computer vision. Aligned with the growing significance of\nefficiency, research in model compression has become pivotal to alleviate the\ncomputational burden in both training and deploying over-parameterized neural\nnetworks. A key goal in model compression is the development of sparse models\ncapable of matching or surpassing the performance of their over-parameterized,\ndense counterparts. While prior research has explored the impact of model\nsparsity on transfer learning, its effects on visual prompting-based transfer\nremain unclear. This study addresses this gap, revealing that model sparsity\nadversely affects the performance of visual prompting-based transfer,\nparticularly in low-data-volume scenarios. Furthermore, our findings highlight\nthe negative influence of sparsity on the calibration of downstream\nvisual-prompted models. This empirical exploration calls for a nuanced\nunderstanding beyond accuracy in sparse settings, opening avenues for further\nresearch in Visual Prompting for sparse models. 
Code and logs can be accessed\nat https://github.com/landskape-ai/Reprogram_LT .\n","authors":["Diganta Misra","Agam Goyal","Bharat Runwal","Pin Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2308.14969v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2311.15637v1","updated":"2023-11-27T09:02:21Z","published":"2023-11-27T09:02:21Z","title":"PaintNeSF: Artistic Creation of Stylized Scenes with Vectorized 3D\n Strokes","summary":" We present Paint Neural Stroke Field (PaintNeSF), a novel technique to\ngenerate stylized images of a 3D scene at arbitrary novel views from multi-view\n2D images. Different from existing methods which apply stylization to trained\nneural radiance fields at the voxel level, our approach draws inspiration from\nimage-to-painting methods, simulating the progressive painting process of human\nartwork with vector strokes. We develop a palette of stylized 3D strokes from\nbasic primitives and splines, and consider the 3D scene stylization task as a\nmulti-view reconstruction process based on these 3D stroke primitives. Instead\nof directly searching for the parameters of these 3D strokes, which would be\ntoo costly, we introduce a differentiable renderer that allows optimizing\nstroke parameters using gradient descent, and propose a training scheme to\nalleviate the vanishing gradient issue. The extensive evaluation demonstrates\nthat our approach effectively synthesizes 3D scenes with significant geometric\nand aesthetic stylization while maintaining a consistent appearance across\ndifferent views. Our method can be further integrated with style loss and\nimage-text contrastive models to extend its applications, including color\ntransfer and text-driven 3D scene drawing.\n","authors":["Hao-Bin Duan","Miao Wang","Yan-Xun Li","Yong-Liang Yang"],"pdf_url":"https://arxiv.org/pdf/2311.15637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00397v2","updated":"2023-11-27T09:02:06Z","published":"2023-11-01T09:46:59Z","title":"Towards Omni-supervised Referring Expression Segmentation","summary":" Referring Expression Segmentation (RES) is an emerging task in computer\nvision, which segments the target instances in images based on text\ndescriptions. However, its development is plagued by the expensive segmentation\nlabels. To address this issue, we propose a new learning task for RES called\nOmni-supervised Referring Expression Segmentation (Omni-RES), which aims to\nmake full use of unlabeled, fully labeled and weakly labeled data, e.g.,\nreferring points or grounding boxes, for efficient RES training. To accomplish\nthis task, we also propose a novel yet strong baseline method for Omni-RES\nbased on the recently popular teacher-student learning, where the weak labels\nare not directly transformed into supervision signals but used as a yardstick\nto select and refine high-quality pseudo-masks for teacher-student learning. To\nvalidate the proposed Omni-RES method, we apply it to a set of state-of-the-art\nRES models and conduct extensive experiments on a bunch of RES datasets. The\nexperimental results yield the obvious merits of Omni-RES than the\nfully-supervised and semi-supervised training schemes. For instance, with only\n10% fully labeled data, Omni-RES can help the base model achieve 100% fully\nsupervised performance, and it also outperform the semi-supervised alternative\nby a large margin, e.g., +14.93% on RefCOCO and +14.95% on RefCOCO+,\nrespectively. 
More importantly, Omni-RES also enables the use of large-scale\nvision-language datasets like Visual Genome to facilitate low-cost RES training, and\nachieves new SOTA performance on RES, e.g., 80.66 on RefCOCO.\n","authors":["Minglang Huang","Yiyi Zhou","Gen Luo","Guannan Jiang","Weilin Zhuang","Xiaoshuai Sun"],"pdf_url":"https://arxiv.org/pdf/2311.00397v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15625v1","updated":"2023-11-27T08:44:00Z","published":"2023-11-27T08:44:00Z","title":"Only Positive Cases: 5-fold High-order Attention Interaction Model for\n Skin Segmentation Derived Classification","summary":" Computer-aided diagnosis of skin diseases is an important tool. However, the\ninterpretability of computer-aided diagnosis is currently poor. Dermatologists\nand patients cannot intuitively understand the learning and prediction process\nof neural networks, which will lead to a decrease in the credibility of\ncomputer-aided diagnosis. In addition, traditional methods need to be trained\nusing negative samples in order to predict the presence or absence of a lesion,\nbut medical data is often in short supply. In this paper, we propose a multiple\nhigh-order attention interaction model (MHA-UNet) for use in a highly\nexplainable skin lesion segmentation task. MHA-UNet is able to obtain the\npresence or absence of a lesion by explainable reasoning without the need for\ntraining on negative samples. Specifically, we propose a high-order attention\ninteraction mechanism that introduces squeeze attention to a higher level for\nfeature attention. In addition, a multiple high-order attention interaction\n(MHAblock) module is proposed by combining the different features of different\norders. For classifying the presence or absence of lesions, we conducted\nclassification experiments on several publicly available datasets in the\nabsence of negative samples, based on explainable reasoning about the\ninteraction of 5 attention orders of MHAblock. The highest positive detection\nrate obtained from the experiments was 81.0% and the highest negative detection\nrate was 83.5%. For segmentation experiments, comparison experiments of the\nproposed method with 13 medical segmentation models and external validation\nexperiments with 8 state-of-the-art models in three public datasets and our\nclinical dataset demonstrate the state-of-the-art performance of our model. The\ncode is available from https://github.com/wurenkai/MHA-UNet.\n","authors":["Renkai Wu","Yinghao Liu","Pengchen Liang","Qing Chang"],"pdf_url":"https://arxiv.org/pdf/2311.15625v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08872v5","updated":"2023-11-27T08:42:07Z","published":"2023-10-13T05:48:42Z","title":"R&B: Region and Boundary Aware Zero-shot Grounded Text-to-image\n Generation","summary":" Recent text-to-image (T2I) diffusion models have achieved remarkable progress\nin generating high-quality images given text-prompts as input. However, these\nmodels fail to convey appropriate spatial composition specified by a layout\ninstruction. 
In this work, we probe into zero-shot grounded T2I generation with\ndiffusion models, that is, generating images corresponding to the input layout\ninformation without training auxiliary modules or finetuning diffusion models.\nWe propose a Region and Boundary (R&B) aware cross-attention guidance approach\nthat gradually modulates the attention maps of diffusion model during\ngenerative process, and assists the model to synthesize images (1) with high\nfidelity, (2) highly compatible with textual input, and (3) interpreting layout\ninstructions accurately. Specifically, we leverage the discrete sampling to\nbridge the gap between consecutive attention maps and discrete layout\nconstraints, and design a region-aware loss to refine the generative layout\nduring diffusion process. We further propose a boundary-aware loss to\nstrengthen object discriminability within the corresponding regions.\nExperimental results show that our method outperforms existing state-of-the-art\nzero-shot grounded T2I generation methods by a large margin both qualitatively\nand quantitatively on several benchmarks.\n","authors":["Jiayu Xiao","Henglei Lv","Liang Li","Shuhui Wang","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2310.08872v5.pdf","comment":"Preprint. Under review. Project page:\n https://sagileo.github.io/Region-and-Boundary"},{"id":"http://arxiv.org/abs/2311.15619v1","updated":"2023-11-27T08:32:28Z","published":"2023-11-27T08:32:28Z","title":"Align before Adapt: Leveraging Entity-to-Region Alignments for\n Generalizable Video Action Recognition","summary":" Large-scale visual-language pre-trained models have achieved significant\nsuccess in various video tasks. However, most existing methods follow an \"adapt\nthen align\" paradigm, which adapts pre-trained image encoders to model\nvideo-level representations and utilizes one-hot or text embedding of the\naction labels for supervision. This paradigm overlooks the challenge of mapping\nfrom static images to complicated activity concepts. In this paper, we propose\na novel \"Align before Adapt\" (ALT) paradigm. Prior to adapting to video\nrepresentation learning, we exploit the entity-to-region alignments for each\nframe. The alignments are fulfilled by matching the region-aware image\nembeddings to an offline-constructed text corpus. With the aligned entities, we\nfeed their text embeddings to a transformer-based video adapter as the queries,\nwhich can help extract the semantics of the most important entities from a\nvideo to a vector. This paradigm reuses the visual-language alignment of VLP\nduring adaptation and tries to explain an action by the underlying entities.\nThis helps understand actions by bridging the gap with complex activity\nsemantics, particularly when facing unfamiliar or unseen categories. ALT\nachieves competitive performance and superior generalizability while requiring\nsignificantly low computational costs. In fully supervised scenarios, it\nachieves 88.1% top-1 accuracy on Kinetics-400 with only 4947 GFLOPs. 
In 2-shot\nexperiments, ALT outperforms the previous state-of-the-art by 7.1% and 9.2% on\nHMDB-51 and UCF-101, respectively.\n","authors":["Yifei Chen","Dapeng Chen","Ruijin Liu","Sai Zhou","Wenyuan Xue","Wei Peng"],"pdf_url":"https://arxiv.org/pdf/2311.15619v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12092v2","updated":"2023-11-27T08:29:54Z","published":"2023-11-20T18:59:01Z","title":"Concept Sliders: LoRA Adaptors for Precise Control in Diffusion Models","summary":" We present a method to create interpretable concept sliders that enable\nprecise control over attributes in image generations from diffusion models. Our\napproach identifies a low-rank parameter direction corresponding to one concept\nwhile minimizing interference with other attributes. A slider is created using\na small set of prompts or sample images; thus slider directions can be created\nfor either textual or visual concepts. Concept Sliders are plug-and-play: they\ncan be composed efficiently and continuously modulated, enabling precise\ncontrol over image generation. In quantitative experiments comparing to\nprevious editing techniques, our sliders exhibit stronger targeted edits with\nlower interference. We showcase sliders for weather, age, styles, and\nexpressions, as well as slider compositions. We show how sliders can transfer\nlatents from StyleGAN for intuitive editing of visual concepts for which\ntextual description is difficult. We also find that our method can help address\npersistent quality issues in Stable Diffusion XL including repair of object\ndeformations and fixing distorted hands. Our code, data, and trained sliders\nare available at https://sliders.baulab.info/\n","authors":["Rohit Gandikota","Joanna Materzynska","Tingrui Zhou","Antonio Torralba","David Bau"],"pdf_url":"https://arxiv.org/pdf/2311.12092v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15615v1","updated":"2023-11-27T08:25:23Z","published":"2023-11-27T08:25:23Z","title":"Technical Report for Argoverse Challenges on Unified Sensor-based\n Detection, Tracking, and Forecasting","summary":" This report presents our Le3DE2E solution for unified sensor-based detection,\ntracking, and forecasting in Argoverse Challenges at CVPR 2023 Workshop on\nAutonomous Driving (WAD). We propose a unified network that incorporates three\ntasks, including detection, tracking, and forecasting. This solution adopts a\nstrong Bird's Eye View (BEV) encoder with spatial and temporal fusion and\ngenerates unified representations for multi-tasks. The solution was tested in\nthe Argoverse 2 sensor dataset to evaluate the detection, tracking, and\nforecasting of 26 object categories. We achieved 1st place in Detection,\nTracking, and Forecasting on the E2E Forecasting track in Argoverse Challenges\nat CVPR 2023 WAD.\n","authors":["Zhepeng Wang","Feng Chen","Kanokphan Lertniphonphan","Siwei Chen","Jinyao Bao","Pengfei Zheng","Jinbao Zhang","Kaer Huang","Tao Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15615v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.05646v2","updated":"2023-11-27T08:12:14Z","published":"2023-04-12T06:49:56Z","title":"Breaking Modality Disparity: Harmonized Representation for Infrared and\n Visible Image Registration","summary":" Since the differences in viewing range, resolution and relative position, the\nmulti-modality sensing module composed of infrared and visible cameras needs to\nbe registered so as to have more accurate scene perception. 
In practice, manual\ncalibration-based registration is the most widely used process, and it is\nregularly calibrated to maintain accuracy, which is time-consuming and\nlabor-intensive. To cope with these problems, we propose a scene-adaptive\ninfrared and visible image registration method. Specifically, in regard to the\ndiscrepancy between multi-modality images, an invertible translation process is\ndeveloped to establish a modality-invariant domain, which comprehensively\nembraces the feature intensity and distribution of both infrared and visible\nmodalities. We employ homography to simulate the deformation between different\nplanes and develop a hierarchical framework to rectify the deformation inferred\nfrom the proposed latent representation in a coarse-to-fine manner. For that,\nthe advanced perception ability coupled with the residual estimation is conducive\nto the regression of sparse offsets, and the alternate correlation search\nfacilitates more accurate correspondence matching. Moreover, we propose the\nfirst ground-truth-available misaligned infrared and visible image dataset,\ninvolving three synthetic sets and one real-world set. Extensive experiments\nvalidate the effectiveness of the proposed method against the\nstate-of-the-arts, advancing the subsequent applications.\n","authors":["Zhiying Jiang","Zengxi Zhang","Jinyuan Liu","Xin Fan","Risheng Liu"],"pdf_url":"https://arxiv.org/pdf/2304.05646v2.pdf","comment":"10 pages, 11 figures"},{"id":"http://arxiv.org/abs/2311.15609v1","updated":"2023-11-27T08:06:56Z","published":"2023-11-27T08:06:56Z","title":"A manometric feature descriptor with linear-SVM to distinguish\n esophageal contraction vigor","summary":" In clinical practice, if a patient presents with nonmechanical obstructive dysphagia,\nesophageal chest pain, and gastroesophageal reflux symptoms, the physician\nwill usually assess the esophageal dynamic function. High-resolution manometry\n(HRM) is a commonly used clinical technique for detecting esophageal\ndynamic function comprehensively and objectively. However, after the results of\nHRM are obtained, doctors still need to evaluate them using a variety of parameters.\nThis work is burdensome, and the process is complex. We conducted image\nprocessing of HRM to predict the esophageal contraction vigor for assisting the\nevaluation of esophageal dynamic function. Firstly, we used Feature-Extraction\nand Histogram of Gradients (FE-HOG) to analyse features of the proposal of swallow\n(PoS) and further extract higher-order features. Then we determine the\nclassification of esophageal contraction vigor (normal, weak and failed) by using\nlinear-SVM according to these features. Our dataset includes 3000 training\nsamples, 500 validation samples and 411 test samples. After verification, our accuracy\nreaches 86.83%, which is higher than other common machine learning methods.\n","authors":["Jialin Liu","Lu Yan","Xiaowei Liu","Yuzhuo Dai","Fanggen Lu","Yuanting Ma","Muzhou Hou","Zheng Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15607v1","updated":"2023-11-27T08:00:53Z","published":"2023-11-27T08:00:53Z","title":"Spatially Covariant Image Registration with Text Prompts","summary":" Medical images are often characterized by their structured anatomical\nrepresentations and spatially inhomogeneous contrasts. Leveraging anatomical\npriors in neural networks can greatly enhance their utility in\nresource-constrained clinical settings. 
Prior research has harnessed such\ninformation for image segmentation, yet progress in deformable image\nregistration has been modest. Our work introduces textSCF, a novel method that\nintegrates spatially covariant filters and textual anatomical prompts encoded\nby visual-language models, to fill this gap. This approach optimizes an\nimplicit function that correlates text embeddings of anatomical regions to\nfilter weights, relaxing the typical translation-invariance constraint of\nconvolutional operations. TextSCF not only boosts computational efficiency but\ncan also retain or improve registration accuracy. By capturing the contextual\ninterplay between anatomical regions, it offers impressive inter-regional\ntransferability and the ability to preserve structural discontinuities during\nregistration. TextSCF's performance has been rigorously tested on inter-subject\nbrain MRI and abdominal CT registration tasks, outperforming existing\nstate-of-the-art models in the MICCAI Learn2Reg 2021 challenge and leading the\nleaderboard. In abdominal registrations, textSCF's larger model variant\nimproved the Dice score by 11.3% over the second-best model, while its smaller\nvariant maintained similar accuracy but with an 89.13% reduction in network\nparameters and a 98.34\\% decrease in computational operations.\n","authors":["Hang Zhang","Xiang Chen","Rongguang Wang","Renjiu Hu","Dongdong Liu","Gaolei Li"],"pdf_url":"https://arxiv.org/pdf/2311.15607v1.pdf","comment":"15 pages, 8 figures, 5 tables"},{"id":"http://arxiv.org/abs/2311.15605v1","updated":"2023-11-27T07:57:29Z","published":"2023-11-27T07:57:29Z","title":"2D Feature Distillation for Weakly- and Semi-Supervised 3D Semantic\n Segmentation","summary":" As 3D perception problems grow in popularity and the need for large-scale\nlabeled datasets for LiDAR semantic segmentation increase, new methods arise\nthat aim to reduce the necessity for dense annotations by employing\nweakly-supervised training. However these methods continue to show weak\nboundary estimation and high false negative rates for small objects and distant\nsparse regions. We argue that such weaknesses can be compensated by using RGB\nimages which provide a denser representation of the scene. We propose an\nimage-guidance network (IGNet) which builds upon the idea of distilling high\nlevel feature information from a domain adapted synthetically trained 2D\nsemantic segmentation network. We further utilize a one-way contrastive\nlearning scheme alongside a novel mixing strategy called FOVMix, to combat the\nhorizontal field-of-view mismatch between the two sensors and enhance the\neffects of image guidance. IGNet achieves state-of-the-art results for\nweakly-supervised LiDAR semantic segmentation on ScribbleKITTI, boasting up to\n98% relative performance to fully supervised training with only 8% labeled\npoints, while introducing no additional annotation burden or\ncomputational/memory cost during inference. 
Furthermore, we show that our\ncontributions also prove effective for semi-supervised training, where IGNet\nclaims state-of-the-art results on both ScribbleKITTI and SemanticKITTI.\n","authors":["Ozan Unal","Dengxin Dai","Lukas Hoyer","Yigit Baran Can","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2311.15605v1.pdf","comment":"Accepted at WACV 2024"},{"id":"http://arxiv.org/abs/2311.15599v1","updated":"2023-11-27T07:48:50Z","published":"2023-11-27T07:48:50Z","title":"UniRepLKNet: A Universal Perception Large-Kernel ConvNet for Audio,\n Video, Point Cloud, Time-Series and Image Recognition","summary":" Large-kernel convolutional neural networks (ConvNets) have recently received\nextensive research attention, but there are two unresolved and critical issues\nthat demand further investigation. 1) The architectures of existing\nlarge-kernel ConvNets largely follow the design principles of conventional\nConvNets or transformers, while the architectural design for large-kernel\nConvNets remains under-addressed. 2) As transformers have dominated multiple\nmodalities, it remains to be investigated whether ConvNets also have a strong\nuniversal perception ability in domains beyond vision. In this paper, we\ncontribute from two aspects. 1) We propose four architectural guidelines for\ndesigning large-kernel ConvNets, the core of which is to exploit the essential\ncharacteristics of large kernels that distinguish them from small kernels -\nthey can see wide without going deep. Following such guidelines, our proposed\nlarge-kernel ConvNet shows leading performance in image recognition. For\nexample, our models achieve an ImageNet accuracy of 88.0%, ADE20K mIoU of\n55.6%, and COCO box AP of 56.4%, demonstrating better performance and higher\nspeed than a number of recently proposed powerful competitors. 2) We discover\nthat large kernels are the key to unlocking the exceptional performance of\nConvNets in domains where they were originally not proficient. With certain\nmodality-related preprocessing approaches, the proposed model achieves\nstate-of-the-art performance on time-series forecasting and audio recognition\ntasks even without modality-specific customization to the architecture. Code\nand all the models at https://github.com/AILab-CVC/UniRepLKNet.\n","authors":["Xiaohan Ding","Yiyuan Zhang","Yixiao Ge","Sijie Zhao","Lin Song","Xiangyu Yue","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2311.15599v1.pdf","comment":"Code, all the models and reproducible training scripts at\n https://github.com/AILab-CVC/UniRepLKNet"},{"id":"http://arxiv.org/abs/2311.15596v1","updated":"2023-11-27T07:44:25Z","published":"2023-11-27T07:44:25Z","title":"Can Vision-Language Models Think from a First-Person Perspective?","summary":" Vision-language models (VLMs) have recently shown promising results in\ntraditional downstream tasks. Evaluation studies have emerged to assess their\nabilities, with the majority focusing on the third-person perspective, and only\na few addressing specific tasks from the first-person perspective. However, the\ncapability of VLMs to \"think\" from a first-person perspective, a crucial\nattribute for advancing autonomous agents and robotics, remains largely\nunexplored. To bridge this research gap, we introduce EgoThink, a novel visual\nquestion-answering benchmark that encompasses six core capabilities with twelve\ndetailed dimensions. 
The benchmark is constructed using selected clips from\negocentric videos, with manually annotated question-answer pairs containing\nfirst-person information. To comprehensively assess VLMs, we evaluate eighteen\npopular VLMs on EgoThink. Moreover, given the open-ended format of the answers,\nwe use GPT-4 as the automatic judge to compute single-answer grading.\nExperimental results indicate that although GPT-4V leads in numerous\ndimensions, all evaluated VLMs still possess considerable potential for\nimprovement in first-person perspective tasks. Meanwhile, enlarging the number\nof trainable parameters has the most significant impact on model performance on\nEgoThink. In conclusion, EgoThink serves as a valuable addition to existing\nevaluation benchmarks for VLMs, providing an indispensable resource for future\nresearch in the realm of embodied artificial intelligence and robotics.\n","authors":["Sijie Cheng","Zhicheng Guo","Jingwen Wu","Kechen Fang","Peng Li","Huaping Liu","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.15596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15586v1","updated":"2023-11-27T07:24:50Z","published":"2023-11-27T07:24:50Z","title":"An Ensemble of 2.5D ResUnet Based Models for Segmentation for Kidney and\n Masses","summary":" The automatic segmentation of kidney, kidney tumor and kidney cyst on\nComputed Tomography (CT) scans is a challenging task due to the indistinct\nlesion boundaries and fuzzy texture. Considering the large range and unbalanced\ndistribution of CT scans' thickness, 2.5D ResUnet models are adopted to build an\nefficient coarse-to-fine semantic segmentation framework in this work. A set of\n489 CT scans are used for training and validation, and an independent,\nnever-before-used set of CT scans for testing. Finally, we demonstrate the\neffectiveness of our proposed method. The dice values on the test set are 0.954,\n0.792, 0.691, and the surface dice values are 0.897, 0.591, 0.541 for kidney, tumor\nand cyst, respectively. The average inference time of each CT scan is 20.65s\nand the max GPU memory is 3525MB. The results suggest that our method achieves a better trade-off\nbetween model performance and efficiency.\n","authors":["Cancan Chen","Rongguo Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15586v1.pdf","comment":"7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2311.15584v1","updated":"2023-11-27T07:19:41Z","published":"2023-11-27T07:19:41Z","title":"A deep learning approach for marine snow synthesis and removal","summary":" Marine snow, the floating particles in underwater images, severely degrades\nthe visibility and performance of human and machine vision systems. This paper\nproposes a novel method to reduce the marine snow interference using deep\nlearning techniques. We first synthesize realistic marine snow samples by\ntraining a Generative Adversarial Network (GAN) model and combine them with\nnatural underwater images to create a paired dataset. We then train a U-Net\nmodel to perform marine snow removal as an image-to-image translation task. Our\nexperiments show that the U-Net model can effectively remove both synthetic and\nnatural marine snow with high accuracy, outperforming state-of-the-art methods\nsuch as the Median filter and its adaptive variant. We also demonstrate the\nrobustness of our method by testing it on the MSRB dataset, which contains\nsynthetic artifacts that our model has not seen during training. 
Our method is\na practical and efficient solution for enhancing underwater images affected by\nmarine snow.\n","authors":["Fernando Galetto","Guang Deng"],"pdf_url":"https://arxiv.org/pdf/2311.15584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15581v1","updated":"2023-11-27T07:19:10Z","published":"2023-11-27T07:19:10Z","title":"Real Time GAZED: Online Shot Selection and Editing of Virtual Cameras\n from Wide-Angle Monocular Video Recordings","summary":" Eliminating time-consuming post-production processes and delivering\nhigh-quality videos in today's fast-paced digital landscape are the key\nadvantages of real-time approaches. To address these needs, we present Real\nTime GAZED: a real-time adaptation of the GAZED framework integrated with\nCineFilter, a novel real-time camera trajectory stabilization approach. It\nenables users to create professionally edited videos in real-time. Comparative\nevaluations against baseline methods, including the non-real-time GAZED,\ndemonstrate that Real Time GAZED achieves similar editing results, ensuring\nhigh-quality video output. Furthermore, a user study confirms the aesthetic\nquality of the video edits produced by the Real Time GAZED approach. With these\nadvancements in real-time camera trajectory optimization and video editing\npresented, the demand for immediate and dynamic content creation in industries\nsuch as live broadcasting, sports coverage, news reporting, and social media\ncontent creation can be met more efficiently.\n","authors":["Sudheer Achary","Rohit Girmaji","Adhiraj Anil Deshmukh","Vineet Gandhi"],"pdf_url":"https://arxiv.org/pdf/2311.15581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15573v1","updated":"2023-11-27T06:55:53Z","published":"2023-11-27T06:55:53Z","title":"EucliDreamer: Fast and High-Quality Texturing for 3D Models with Stable\n Diffusion Depth","summary":" This paper presents a novel method to generate textures for 3D models given\ntext prompts and 3D meshes. Additional depth information is taken into account\nto perform the Score Distillation Sampling (SDS) process [28] with depth\nconditional Stable Diffusion [34]. We ran our model over the open-source\ndataset Objaverse [7] and conducted a user study to compare the results with\nthose of various 3D texturing methods. We have shown that our model can\ngenerate more satisfactory results and produce various art styles for the same\nobject. In addition, we achieved faster time when generating textures of\ncomparable quality. We also conduct thorough ablation studies of how different\nfactors may affect generation quality, including sampling steps, guidance\nscale, negative prompts, data augmentation, elevation range, and alternatives\nto SDS.\n","authors":["Cindy Le","Congrui Hetang","Ang Cao","Yihui He"],"pdf_url":"https://arxiv.org/pdf/2311.15573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11317v3","updated":"2023-11-27T06:52:44Z","published":"2023-11-19T13:07:06Z","title":"Discrete approximations of Gaussian smoothing and Gaussian derivatives","summary":" This paper develops an in-depth treatment concerning the problem of\napproximating the Gaussian smoothing and Gaussian derivative computations in\nscale-space theory for application on discrete data. 
With close connections to\nprevious axiomatic treatments of continuous and discrete scale-space theory, we\nconsider three main ways discretizing these scale-space operations in terms of\nexplicit discrete convolutions, based on either (i) sampling the Gaussian\nkernels and the Gaussian derivative kernels, (ii) locally integrating the\nGaussian kernels and the Gaussian derivative kernels over each pixel support\nregion and (iii) basing the scale-space analysis on the discrete analogue of\nthe Gaussian kernel, and then computing derivative approximations by applying\nsmall-support central difference operators to the spatially smoothed image\ndata.\n We study the properties of these three main discretization methods both\ntheoretically and experimentally, and characterize their performance by\nquantitative measures, including the results they give rise to with respect to\nthe task of scale selection, investigated for four different use cases, and\nwith emphasis on the behaviour at fine scales. The results show that the\nsampled Gaussian kernels and derivatives as well as the integrated Gaussian\nkernels and derivatives perform very poorly at very fine scales. At very fine\nscales, the discrete analogue of the Gaussian kernel with its corresponding\ndiscrete derivative approximations performs substantially better. The sampled\nGaussian kernel and the sampled Gaussian derivatives do, on the other hand,\nlead to numerically very good approximations of the corresponding continuous\nresults, when the scale parameter is sufficiently large, in the experiments\npresented in the paper, when the scale parameter is greater than a value of\nabout 1, in units of the grid spacing.\n","authors":["Tony Lindeberg"],"pdf_url":"https://arxiv.org/pdf/2311.11317v3.pdf","comment":"38 pages, 34 figures"},{"id":"http://arxiv.org/abs/2311.15571v1","updated":"2023-11-27T06:45:22Z","published":"2023-11-27T06:45:22Z","title":"Video-based Visible-Infrared Person Re-Identification with Auxiliary\n Samples","summary":" Visible-infrared person re-identification (VI-ReID) aims to match persons\ncaptured by visible and infrared cameras, allowing person retrieval and\ntracking in 24-hour surveillance systems. Previous methods focus on learning\nfrom cross-modality person images in different cameras. However, temporal\ninformation and single-camera samples tend to be neglected. To crack this nut,\nin this paper, we first contribute a large-scale VI-ReID dataset named\nBUPTCampus. Different from most existing VI-ReID datasets, it 1) collects\ntracklets instead of images to introduce rich temporal information, 2) contains\npixel-aligned cross-modality sample pairs for better modality-invariant\nlearning, 3) provides one auxiliary set to help enhance the optimization, in\nwhich each identity only appears in a single camera. Based on our constructed\ndataset, we present a two-stream framework as baseline and apply Generative\nAdversarial Network (GAN) to narrow the gap between the two modalities. To\nexploit the advantages introduced by the auxiliary set, we propose a curriculum\nlearning based strategy to jointly learn from both primary and auxiliary sets.\nMoreover, we design a novel temporal k-reciprocal re-ranking method to refine\nthe ranking list with fine-grained temporal correlation cues. Experimental\nresults demonstrate the effectiveness of the proposed methods. We also\nreproduce 9 state-of-the-art image-based and video-based VI-ReID methods on\nBUPTCampus and our methods show substantial superiority to them. 
The codes and\ndataset are available at: https://github.com/dyhBUPT/BUPTCampus.\n","authors":["Yunhao Du","Cheng Lei","Zhicheng Zhao","Yuan Dong","Fei Su"],"pdf_url":"https://arxiv.org/pdf/2311.15571v1.pdf","comment":"Accepted by Transactions on Information Forensics & Security 2023"},{"id":"http://arxiv.org/abs/2311.15570v1","updated":"2023-11-27T06:38:07Z","published":"2023-11-27T06:38:07Z","title":"UFDA: Universal Federated Domain Adaptation with Practical Assumptions","summary":" Conventional Federated Domain Adaptation (FDA) approaches usually demand an\nabundance of assumptions, such as label set consistency, which makes them\nsignificantly less feasible for real-world situations and introduces security\nhazards. In this work, we propose a more practical scenario named Universal\nFederated Domain Adaptation (UFDA). It only requires the black-box model and\nthe label set information of each source domain, while the label sets of\ndifferent source domains could be inconsistent and the target-domain label set\nis totally blind. This relaxes the assumptions made by FDA, which are often\nchallenging to meet in real-world cases and diminish model security. To address\nthe UFDA scenario, we propose a corresponding framework called Hot-Learning\nwith Contrastive Label Disambiguation (HCLD), which tackles UFDA's domain\nshifts and category gaps problem by using one-hot outputs from the black-box\nmodels of various source domains. Moreover, to better distinguish the shared\nand unknown classes, we further present a cluster-level strategy named\nMutual-Voting Decision (MVD) to extract robust consensus knowledge across peer\nclasses from both source and target domains. The extensive experiments on three\nbenchmarks demonstrate that our HCLD achieves comparable performance for our\nUFDA scenario with much fewer assumptions, compared to the previous\nmethodologies with many additional assumptions.\n","authors":["Xinhui Liu","Zhenghao Chen","Luping Zhou","Dong Xu","Wei Xi","Gairui Bai","Yihan Zhao","Jizhong Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.15570v1.pdf","comment":"Submitted to AAAI2024"},{"id":"http://arxiv.org/abs/2311.15569v1","updated":"2023-11-27T06:37:05Z","published":"2023-11-27T06:37:05Z","title":"Improving Adaptability and Generalizability of Efficient Transfer\n Learning for Vision-Language Models","summary":" Vision-Language Models (VLMs) like CLIP have demonstrated remarkable\napplicability across a variety of downstream tasks, including zero-shot image\nclassification. Recently, the use of prompts or adapters for efficient transfer\nlearning has gained significant attention for effectively adapting to\ndownstream tasks. However, the roles of vision and text prompts, as well as\nadapters in terms of generalization and transfer difficulty, have been\noverlooked, limiting performance on unseen tasks. In this paper, we empirically\nanalyze how VLMs behave when using vision and text prompts, adapters, and a\ncombination of these components, marking a novel exploration by our study. Our\nobservations find that utilizing vision prompts for class separability and text\nadapters for task adaptation is crucial for adaptability and generalizability.\nMoreover, to improve generalization across every domain, we propose an adaptive\nensemble method that effectively combines the general knowledge of VLMs with\ntask-specific knowledge according to transfer difficulty. 
Upon experimenting\nwith extensive benchmarks, our method consistently outperforms all baselines,\nparticularly on unseen tasks, demonstrating the effectiveness of our proposed\napproach.\n","authors":["Yongjin Yang","Jongwoo Ko","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2311.15569v1.pdf","comment":"11 pages (19 pages including supplementary), 10 figures (12 figures\n including supplementary), 6 tables (17 tables including supplementary)"},{"id":"http://arxiv.org/abs/2310.17294v3","updated":"2023-11-27T06:21:03Z","published":"2023-10-26T10:18:51Z","title":"Scale-Adaptive Feature Aggregation for Efficient Space-Time Video\n Super-Resolution","summary":" The Space-Time Video Super-Resolution (STVSR) task aims to enhance the visual\nquality of videos by simultaneously performing video frame interpolation (VFI)\nand video super-resolution (VSR). However, facing the challenge of the\nadditional temporal dimension and scale inconsistency, most existing STVSR\nmethods are complex and inflexible in dynamically modeling different motion\namplitudes. In this work, we find that choosing an appropriate processing scale\nachieves remarkable benefits in flow-based feature propagation. We propose a\nnovel Scale-Adaptive Feature Aggregation (SAFA) network that adaptively selects\nsub-networks with different processing scales for individual samples.\nExperiments on four public STVSR benchmarks demonstrate that SAFA achieves\nstate-of-the-art performance. Our SAFA network outperforms recent\nstate-of-the-art methods such as TMNet and VideoINR by an average improvement\nof over 0.5dB on PSNR, while requiring less than half the number of parameters\nand only 1/3 of the computational costs.\n","authors":["Zhewei Huang","Ailin Huang","Xiaotao Hu","Chen Hu","Jun Xu","Shuchang Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.17294v3.pdf","comment":"WACV2024, 16 pages"},{"id":"http://arxiv.org/abs/2311.15562v1","updated":"2023-11-27T06:19:00Z","published":"2023-11-27T06:19:00Z","title":"Fully Authentic Visual Question Answering Dataset from Online\n Communities","summary":" Visual Question Answering (VQA) entails answering questions about images. We\nintroduce the first VQA dataset in which all contents originate from an\nauthentic use case. Sourced from online question answering community forums, we\ncall it VQAonline. We then characterize our dataset and how it relates to eight\nother VQA datasets. Observing that answers in our dataset tend to be much\nlonger (e.g., with a mean of 173 words) and thus incompatible with standard VQA\nevaluation metrics, we next analyze which of the six popular metrics for longer\ntext evaluation align best with human judgments. We then use the best-suited\nmetrics to evaluate six state-of-the-art vision and language foundation models\non VQAonline and reveal where they struggle most. We will release the dataset\nsoon to facilitate future extensions.\n","authors":["Chongyan Chen","Mengchen Liu","Noel Codella","Yunsheng Li","Lu Yuan","Danna Gurari"],"pdf_url":"https://arxiv.org/pdf/2311.15562v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15561v1","updated":"2023-11-27T06:14:23Z","published":"2023-11-27T06:14:23Z","title":"ET3D: Efficient Text-to-3D Generation via Multi-View Distillation","summary":" Recent breakthroughs in text-to-image generation have shown encouraging\nresults via large generative models. Due to the scarcity of 3D assets, it is\nhard to transfer the success of text-to-image generation to that of\ntext-to-3D generation. 
Existing text-to-3D generation methods usually adopt the\nparadigm of DreamFusion, which conducts per-asset optimization by distilling a\npretrained text-to-image diffusion model. The generation speed usually ranges\nfrom several minutes to tens of minutes per 3D asset, which degrades the user\nexperience and also imposes a burden to the service providers due to the high\ncomputational budget.\n In this work, we present an efficient text-to-3D generation method, which\nrequires only around 8 $ms$ to generate a 3D asset given the text prompt on a\nconsumer graphic card. The main insight is that we exploit the images generated\nby a large pre-trained text-to-image diffusion model, to supervise the training\nof a text conditioned 3D generative adversarial network. Once the network is\ntrained, we are able to efficiently generate a 3D asset via a single forward\npass. Our method requires no 3D training data and provides an alternative\napproach for efficient text-to-3D generation by distilling pre-trained image\ndiffusion models.\n","authors":["Yiming Chen","Zhiqi Li","Peidong Liu"],"pdf_url":"https://arxiv.org/pdf/2311.15561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12386v2","updated":"2023-11-27T05:58:45Z","published":"2023-11-21T06:55:21Z","title":"Point, Segment and Count: A Generalized Framework for Object Counting","summary":" Class-agnostic object counting aims to count all objects in an image with\nrespect to example boxes or class names, \\emph{a.k.a} few-shot and zero-shot\ncounting. Current state-of-the-art methods highly rely on density maps to\npredict object counts, which lacks model interpretability. In this paper, we\npropose a generalized framework for both few-shot and zero-shot object counting\nbased on detection. Our framework combines the superior advantages of two\nfoundation models without compromising their zero-shot capability: (\\textbf{i})\nSAM to segment all possible objects as mask proposals, and (\\textbf{ii}) CLIP\nto classify proposals to obtain accurate object counts. However, this strategy\nmeets the obstacles of efficiency overhead and the small crowded objects that\ncannot be localized and distinguished. To address these issues, our framework,\ntermed PseCo, follows three steps: point, segment, and count. Specifically, we\nfirst propose a class-agnostic object localization to provide accurate but\nleast point prompts for SAM, which consequently not only reduces computation\ncosts but also avoids missing small objects. Furthermore, we propose a\ngeneralized object classification that leverages CLIP image/text embeddings as\nthe classifier, following a hierarchical knowledge distillation to obtain\ndiscriminative classifications among hierarchical mask proposals. Extensive\nexperimental results on FSC-147 dataset demonstrate that PseCo achieves\nstate-of-the-art performance in both few-shot/zero-shot object\ncounting/detection, with additional results on large-scale COCO and LVIS\ndatasets. 
The source code is available at\n\url{https://github.com/Hzzone/PseCo}.\n","authors":["Zhizhong Huang","Mingliang Dai","Yi Zhang","Junping Zhang","Hongming Shan"],"pdf_url":"https://arxiv.org/pdf/2311.12386v2.pdf","comment":"Fix typos"},{"id":"http://arxiv.org/abs/2311.15556v1","updated":"2023-11-27T05:53:03Z","published":"2023-11-27T05:53:03Z","title":"PKU-I2IQA: An Image-to-Image Quality Assessment Database for AI\n Generated Images","summary":" With the development of image generation technology, AI-based image\ngeneration has been applied in various fields. However, the development of AIGC\nimage generative models also brings new problems and challenges. A significant\nchallenge is that AI-generated images (AIGI) compared to natural images may\nhave some unique distortions, and not all generated images meet the\nrequirements of the real world, so it is of great significance to evaluate\nAI-generated images more comprehensively. Although previous work has\nestablished some human perception-based AIGC image quality assessment databases\nfor text-generated images, the AI image generation technology includes\nscenarios like text-to-image and image-to-image, and assessing only the images\ngenerated by text-to-image models is insufficient. To address this issue, we\nhave established a human perception-based image-to-image AIGC image quality\nassessment database, named PKU-I2IQA. We conducted a comprehensive analysis of\nthe PKU-I2IQA database. Furthermore, we introduced two benchmark models:\nNR-AIGCIQA based on no-reference image quality assessment and FR-AIGCIQA based\non full-reference image quality assessment. Finally, leveraging this database,\nwe conducted benchmark experiments and compared the performance of the proposed\nbenchmark models. The PKU-I2IQA database and benchmarks will be released to\nfacilitate future research at https://github.com/jiquan123/I2IQA.\n Keywords: AIGC, image-to-image generation, image quality assessment,\nNR-AIGCIQA, FR-AIGCIQA\n","authors":["Jiquan Yuan","Xinyan Cao","Changjin Li","Fanyi Yang","Jinlong Lin","Xixin Cao"],"pdf_url":"https://arxiv.org/pdf/2311.15556v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2311.15551v1","updated":"2023-11-27T05:35:49Z","published":"2023-11-27T05:35:49Z","title":"Instruct2Attack: Language-Guided Semantic Adversarial Attacks","summary":" We propose Instruct2Attack (I2A), a language-guided semantic attack that\ngenerates semantically meaningful perturbations according to free-form language\ninstructions. We make use of state-of-the-art latent diffusion models, where we\nadversarially guide the reverse diffusion process to search for an adversarial\nlatent code conditioned on the input image and text instruction. Compared to\nexisting noise-based and semantic attacks, I2A generates more natural and\ndiverse adversarial examples while providing better controllability and\ninterpretability. We further automate the attack process with GPT-4 to generate\ndiverse image-specific text instructions. 
We show that I2A can successfully\nbreak state-of-the-art deep neural networks even under strong adversarial\ndefenses, and demonstrate great transferability among a variety of network\narchitectures.\n","authors":["Jiang Liu","Chen Wei","Yuxiang Guo","Heng Yu","Alan Yuille","Soheil Feizi","Chun Pong Lau","Rama Chellappa"],"pdf_url":"https://arxiv.org/pdf/2311.15551v1.pdf","comment":"under submission, code coming soon"},{"id":"http://arxiv.org/abs/2311.15547v1","updated":"2023-11-27T05:23:01Z","published":"2023-11-27T05:23:01Z","title":"Dataset Distillation in Latent Space","summary":" Dataset distillation (DD) is a newly emerging research area aiming at\nalleviating the heavy computational load in training models on large datasets.\nIt tries to distill a large dataset into a small and condensed one so that\nmodels trained on the distilled dataset can perform comparably with those\ntrained on the full dataset when performing downstream tasks. Among the\nprevious works in this area, there are three key problems that hinder the\nperformance and availability of the existing DD methods: high time complexity,\nhigh space complexity, and low info-compactness. In this work, we\nsimultaneously attempt to settle these three problems by moving the DD\nprocesses from conventionally used pixel space to latent space. Encoded by a\npretrained generic autoencoder, latent codes in the latent space are naturally\ninfo-compact representations of the original images in much smaller sizes.\nAfter transferring three mainstream DD algorithms to latent space, we\nsignificantly reduce time and space consumption while achieving similar\nperformance, allowing us to distill high-resolution datasets or target at\ngreater data ratio that previous methods have failed. Besides, within the same\nstorage budget, we can also quantitatively deliver more latent codes than\npixel-level images, which further boosts the performance of our methods.\n","authors":["Yuxuan Duan","Jianfu Zhang","Liqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15547v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2311.15543v1","updated":"2023-11-27T05:20:11Z","published":"2023-11-27T05:20:11Z","title":"Beyond Pixels: Exploring Human-Readable SVG Generation for Simple Images\n with Vision Language Models","summary":" In the field of computer graphics, the use of vector graphics, particularly\nScalable Vector Graphics (SVG), represents a notable development from\ntraditional pixel-based imagery. SVGs, with their XML-based format, are\ndistinct in their ability to directly and explicitly represent visual elements\nsuch as shape, color, and path. This direct representation facilitates a more\naccurate and logical depiction of graphical elements, enhancing reasoning and\ninterpretability. Recognizing the potential of SVGs, the machine learning\ncommunity has introduced multiple methods for image vectorization. However,\ntransforming images into SVG format while retaining the relational properties\nand context of the original scene remains a key challenge. Most vectorization\nmethods often yield SVGs that are overly complex and not easily interpretable.\nIn response to this challenge, we introduce our method, Simple-SVG-Generation\n(S\\textsuperscript{2}VG\\textsuperscript{2}). Our method focuses on producing\nSVGs that are both accurate and simple, aligning with human readability and\nunderstanding. 
With simple images, we evaluate our method with reasoning tasks\ntogether with advanced language models, the results show a clear improvement\nover previous SVG generation methods. We also conducted surveys for human\nevaluation on the readability of our generated SVGs, the results also favor our\nmethods.\n","authors":["Tong Zhang","Haoyang Liu","Peiyan Zhang","Yuxuan Cheng","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15543v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.15540v1","updated":"2023-11-27T05:10:15Z","published":"2023-11-27T05:10:15Z","title":"EAFP-Med: An Efficient Adaptive Feature Processing Module Based on\n Prompts for Medical Image Detection","summary":" In the face of rapid advances in medical imaging, cross-domain adaptive\nmedical image detection is challenging due to the differences in lesion\nrepresentations across various medical imaging technologies. To address this\nissue, we draw inspiration from large language models to propose EAFP-Med, an\nefficient adaptive feature processing module based on prompts for medical image\ndetection. EAFP-Med can efficiently extract lesion features of different scales\nfrom a diverse range of medical images based on prompts while being flexible\nand not limited by specific imaging techniques. Furthermore, it serves as a\nfeature preprocessing module that can be connected to any model front-end to\nenhance the lesion features in input images. Moreover, we propose a novel\nadaptive disease detection model named EAFP-Med ST, which utilizes the Swin\nTransformer V2 - Tiny (SwinV2-T) as its backbone and connects it to EAFP-Med.\nWe have compared our method to nine state-of-the-art methods. Experimental\nresults demonstrate that EAFP-Med ST achieves the best performance on all three\ndatasets (chest X-ray images, cranial magnetic resonance imaging images, and\nskin images). EAFP-Med can efficiently extract lesion features from various\nmedical images based on prompts, enhancing the model's performance. This holds\nsignificant potential for improving medical image analysis and diagnosis.\n","authors":["Xiang Li","Long Lan","Husam Lahza","Shaowu Yang","Shuihua Wang","Wenjing Yang","Hengzhu Liu","Yudong Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09777v2","updated":"2023-11-27T05:09:29Z","published":"2023-09-18T13:58:42Z","title":"DriveDreamer: Towards Real-world-driven World Models for Autonomous\n Driving","summary":" World models, especially in autonomous driving, are trending and drawing\nextensive attention due to their capacity for comprehending driving\nenvironments. The established world model holds immense potential for the\ngeneration of high-quality driving videos, and driving policies for safe\nmaneuvering. However, a critical limitation in relevant research lies in its\npredominant focus on gaming environments or simulated settings, thereby lacking\nthe representation of real-world driving scenarios. Therefore, we introduce\nDriveDreamer, a pioneering world model entirely derived from real-world driving\nscenarios. Regarding that modeling the world in intricate driving scenes\nentails an overwhelming search space, we propose harnessing the powerful\ndiffusion model to construct a comprehensive representation of the complex\nenvironment. Furthermore, we introduce a two-stage training pipeline. 
In the\ninitial phase, DriveDreamer acquires a deep understanding of structured traffic\nconstraints, while the subsequent stage equips it with the ability to\nanticipate future states. The proposed DriveDreamer is the first world model\nestablished from real-world driving scenarios. We instantiate DriveDreamer on\nthe challenging nuScenes benchmark, and extensive experiments verify that\nDriveDreamer empowers precise, controllable video generation that faithfully\ncaptures the structural constraints of real-world traffic scenarios.\nAdditionally, DriveDreamer enables the generation of realistic and reasonable\ndriving policies, opening avenues for interaction and practical applications.\n","authors":["Xiaofeng Wang","Zheng Zhu","Guan Huang","Xinze Chen","Jiagang Zhu","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2309.09777v2.pdf","comment":"Project Page: https://drivedreamer.github.io"},{"id":"http://arxiv.org/abs/2311.15537v1","updated":"2023-11-27T05:00:38Z","published":"2023-11-27T05:00:38Z","title":"SED: A Simple Encoder-Decoder for Open-Vocabulary Semantic Segmentation","summary":" Open-vocabulary semantic segmentation strives to distinguish pixels into\ndifferent semantic groups from an open set of categories. Most existing methods\nexplore utilizing pre-trained vision-language models, in which the key is to\nadopt the image-level model for pixel-level segmentation task. In this paper,\nwe propose a simple encoder-decoder, named SED, for open-vocabulary semantic\nsegmentation, which comprises a hierarchical encoder-based cost map generation\nand a gradual fusion decoder with category early rejection. The hierarchical\nencoder-based cost map generation employs hierarchical backbone, instead of\nplain transformer, to predict pixel-level image-text cost map. Compared to\nplain transformer, hierarchical backbone better captures local spatial\ninformation and has linear computational complexity with respect to input size.\nOur gradual fusion decoder employs a top-down structure to combine cost map and\nthe feature maps of different backbone levels for segmentation. To accelerate\ninference speed, we introduce a category early rejection scheme in the decoder\nthat rejects many no-existing categories at the early layer of decoder,\nresulting in at most 4.7 times acceleration without accuracy degradation.\nExperiments are performed on multiple open-vocabulary semantic segmentation\ndatasets, which demonstrates the efficacy of our SED method. When using\nConvNeXt-B, our SED method achieves mIoU score of 31.6\\% on ADE20K with 150\ncategories at 82 millisecond ($ms$) per image on a single A6000. We will\nrelease it at \\url{https://github.com/xb534/SED.git}.\n","authors":["Bin Xie","Jiale Cao","Jin Xie","Fahad Shahbaz Khan","Yanwei Pang"],"pdf_url":"https://arxiv.org/pdf/2311.15537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15536v1","updated":"2023-11-27T04:49:24Z","published":"2023-11-27T04:49:24Z","title":"SVRDA: A Web-based Dataset Annotation Tool for Slice-to-Volume\n Registration","summary":" Background and Objective: The lack of benchmark datasets has impeded the\ndevelopment of slice-to-volume registration algorithms. Such datasets are\ndifficult to annotate, primarily due to the dimensional difference within data\nand the dearth of task-specific software. 
We aim to develop a user-friendly\ntool to streamline dataset annotation for slice-to-volume registration.\n Methods: The proposed tool, named SVRDA, is an installation-free web\napplication for platform-agnostic collaborative dataset annotation. It enables\nefficient transformation manipulation via keyboard shortcuts and smooth case\ntransitions with auto-saving. SVRDA supports configuration-based data loading\nand adheres to the separation of concerns, offering great flexibility and\nextensibility for future research. Various supplementary features have been\nimplemented to facilitate slice-to-volume registration.\n Results: We validated the effectiveness of SVRDA by indirectly evaluating the\npost-registration segmentation quality on UK Biobank data, observing a dramatic\noverall improvement (24.02% in the Dice Similarity Coefficient and 48.93% in\nthe 95th percentile Hausdorff distance, respectively) supported by highly\nstatistically significant evidence ($p<0.001$). We further showcased the\nclinical usage of SVRDA by integrating it into test-retest T1 quantification on\nin-house magnetic resonance images, leading to more consistent results after\nregistration.\n Conclusions: SVRDA can facilitate collaborative annotation of benchmark\ndatasets while being potentially applicable to other pipelines incorporating\nslice-to-volume registration. Full source code and documentation are available\nat https://github.com/Roldbach/SVRDA\n","authors":["Weixun Luo","Alexandre Triay Bagur","Paul Aljabar","George Ralli","Sir Michael Brady"],"pdf_url":"https://arxiv.org/pdf/2311.15536v1.pdf","comment":"18 pages, 11 figures, In submission to Computer Methods and Programs\n in Biomedicine"},{"id":"http://arxiv.org/abs/2310.01852v6","updated":"2023-11-27T04:28:58Z","published":"2023-10-03T07:33:27Z","title":"LanguageBind: Extending Video-Language Pretraining to N-modality by\n Language-based Semantic Alignment","summary":" The video-language (VL) pretraining has achieved remarkable improvement in\nmultiple downstream tasks. However, the current VL pretraining framework is\nhard to extend to multiple modalities (N modalities, N>=3) beyond vision and\nlanguage. We thus propose LanguageBind, taking the language as the bind across\ndifferent modalities because the language modality is well-explored and\ncontains rich semantics. Specifically, we freeze the language encoder acquired\nby VL pretraining, then train encoders for other modalities with contrastive\nlearning. As a result, all modalities are mapped to a shared feature space,\nimplementing multi-modal semantic alignment. While LanguageBind ensures that we\ncan extend VL modalities to N modalities, we also need a high-quality dataset\nwith alignment data pairs centered on language. We thus propose VIDAL-10M with\nVideo, Infrared, Depth, Audio and their corresponding Language, which we name\nVIDAL-10M. In our VIDAL-10M, all videos are from short video platforms with\ncomplete semantics rather than truncated segments from long videos, and all the\nvideo, depth, infrared, and audio modalities are aligned to their textual\ndescriptions. After pretraining on VIDAL-10M, we outperform ImageBind by 5.8%\nR@1 on the MSR-VTT dataset with only 15% of the parameters in the zero-shot\nvideo-text retrieval task. Beyond this, our LanguageBind has greatly improved\nin the zero-shot video, audio, depth, and infrared understanding tasks. For\ninstance, LanguageBind surpasses InterVideo by 1.9% on MSR-VTT, 8.8% on MSVD,\n6.3% on DiDeMo, and 4.4% on ActivityNet. 
On the LLVIP and NYU-D datasets,\nLanguageBind outperforms ImageBind with 23.8% and 11.1% top-1 accuracy. Code\naddress: https://github.com/PKU-YuanGroup/LanguageBind.\n","authors":["Bin Zhu","Bin Lin","Munan Ning","Yang Yan","Jiaxi Cui","HongFa Wang","Yatian Pang","Wenhao Jiang","Junwu Zhang","Zongwei Li","Wancai Zhang","Zhifeng Li","Wei Liu","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2310.01852v6.pdf","comment":"Under review as a conference paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2310.18332v2","updated":"2023-11-27T04:22:54Z","published":"2023-10-20T12:44:44Z","title":"WordArt Designer: User-Driven Artistic Typography Synthesis using Large\n Language Models","summary":" This paper introduces WordArt Designer, a user-driven framework for artistic\ntypography synthesis, relying on the Large Language Model (LLM). The system\nincorporates four key modules: the LLM Engine, SemTypo, StyTypo, and TexTypo\nmodules. 1) The LLM Engine, empowered by the LLM (e.g., GPT-3.5), interprets\nuser inputs and generates actionable prompts for the other modules, thereby\ntransforming abstract concepts into tangible designs. 2) The SemTypo module\noptimizes font designs using semantic concepts, striking a balance between\nartistic transformation and readability. 3) Building on the semantic layout\nprovided by the SemTypo module, the StyTypo module creates smooth, refined\nimages. 4) The TexTypo module further enhances the design's aesthetics through\ntexture rendering, enabling the generation of inventive textured fonts.\nNotably, WordArt Designer highlights the fusion of generative AI with artistic\ntypography. Experience its capabilities on ModelScope:\nhttps://www.modelscope.cn/studios/WordArt/WordArt.\n","authors":["Jun-Yan He","Zhi-Qi Cheng","Chenyang Li","Jingdong Sun","Wangmeng Xiang","Xianhui Lin","Xiaoyang Kang","Zengke Jin","Yusen Hu","Bin Luo","Yifeng Geng","Xuansong Xie","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.18332v2.pdf","comment":"Accepted by EMNLP 2023, 10 pages, 11 figures, 1 table, the system is\n at https://www.modelscope.cn/studios/WordArt/WordArt"},{"id":"http://arxiv.org/abs/2311.15529v1","updated":"2023-11-27T04:22:48Z","published":"2023-11-27T04:22:48Z","title":"Efficient Dataset Distillation via Minimax Diffusion","summary":" Dataset distillation reduces the storage and computational consumption of\ntraining a network by generating a small surrogate dataset that encapsulates\nrich information of the original large-scale one. However, previous\ndistillation methods heavily rely on the sample-wise iterative optimization\nscheme. As the images-per-class (IPC) setting or image resolution grows larger,\nthe necessary computation will demand overwhelming time and resources. In this\nwork, we intend to incorporate generative diffusion techniques for computing\nthe surrogate dataset. Observing that key factors for constructing an effective\nsurrogate dataset are representativeness and diversity, we design additional\nminimax criteria in the generative training to enhance these facets for the\ngenerated images of diffusion models. We present a theoretical model of the\nprocess as hierarchical diffusion control demonstrating the flexibility of the\ndiffusion process to target these criteria without jeopardizing the\nfaithfulness of the sample to the desired distribution. The proposed method\nachieves state-of-the-art validation performance while demanding much less\ncomputational resources. 
Under the 100-IPC setting on ImageWoof, our method\nrequires less than one-twentieth the distillation time of previous methods, yet\nyields even better performance. Source code available in\nhttps://github.com/vimar-gu/MinimaxDiffusion.\n","authors":["Jianyang Gu","Saeed Vahidian","Vyacheslav Kungurtsev","Haonan Wang","Wei Jiang","Yang You","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2311.15529v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12532v2","updated":"2023-11-27T03:33:37Z","published":"2023-08-24T03:43:02Z","title":"FedSoL: Bridging Global Alignment and Local Generality in Federated\n Learning","summary":" Federated Learning (FL) aggregates locally trained models from individual\nclients to construct a global model. While FL enables learning a model with\ndata privacy, it often suffers from significant performance degradation when\nclient data distributions are heterogeneous. Many previous FL algorithms have\naddressed this issue by introducing various proximal restrictions. These\nrestrictions aim to encourage global alignment by constraining the deviation of\nlocal learning from the global objective. However, they inherently limit local\nlearning by interfering with the original local objectives. Recently, an\nalternative approach has emerged to improve local learning generality. By\nobtaining local models within a smooth loss landscape, this approach mitigates\nconflicts among different local objectives of the clients. Yet, it does not\nensure stable global alignment, as local learning does not take the global\nobjective into account. In this study, we propose Federated Stability on\nLearning (FedSoL), which combines both the concepts of global alignment and\nlocal generality. In FedSoL, the local learning seeks a parameter region robust\nagainst proximal perturbations. This strategy introduces an implicit proximal\nrestriction effect in local learning while maintaining the original local\nobjective for parameter update. Our experiments show that FedSoL consistently\nachieves state-of-the-art performance on various setups.\n","authors":["Gihun Lee","Minchan Jeong","Sangmook Kim","Jaehoon Oh","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2308.12532v2.pdf","comment":"19 pages, 12 figures"},{"id":"http://arxiv.org/abs/2310.07206v2","updated":"2023-11-27T03:32:21Z","published":"2023-10-11T05:34:36Z","title":"DeepSimHO: Stable Pose Estimation for Hand-Object Interaction via\n Physics Simulation","summary":" This paper addresses the task of 3D pose estimation for a hand interacting\nwith an object from a single image observation. When modeling hand-object\ninteraction, previous works mainly exploit proximity cues, while overlooking\nthe dynamical nature that the hand must stably grasp the object to counteract\ngravity and thus preventing the object from slipping or falling. These works\nfail to leverage dynamical constraints in the estimation and consequently often\nproduce unstable results. Meanwhile, refining unstable configurations with\nphysics-based reasoning remains challenging, both by the complexity of contact\ndynamics and by the lack of effective and efficient physics inference in the\ndata-driven learning framework. To address both issues, we present DeepSimHO: a\nnovel deep-learning pipeline that combines forward physics simulation and\nbackward gradient approximation with a neural network. Specifically, for an\ninitial hand-object pose estimated by a base network, we forward it to a\nphysics simulator to evaluate its stability. 
However, due to non-smooth contact\ngeometry and penetration, existing differentiable simulators can not provide\nreliable state gradient. To remedy this, we further introduce a deep network to\nlearn the stability evaluation process from the simulator, while smoothly\napproximating its gradient and thus enabling effective back-propagation.\nExtensive experiments show that our method noticeably improves the stability of\nthe estimation and achieves superior efficiency over test-time optimization.\nThe code is available at https://github.com/rongakowang/DeepSimHO.\n","authors":["Rong Wang","Wei Mao","Hongdong Li"],"pdf_url":"https://arxiv.org/pdf/2310.07206v2.pdf","comment":"Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.15512v1","updated":"2023-11-27T03:15:48Z","published":"2023-11-27T03:15:48Z","title":"Sparse Pedestrian Character Learning for Trajectory Prediction","summary":" Pedestrian trajectory prediction in a first-person view has recently\nattracted much attention due to its importance in autonomous driving. Recent\nwork utilizes pedestrian character information, \\textit{i.e.}, action and\nappearance, to improve the learned trajectory embedding and achieves\nstate-of-the-art performance. However, it neglects the invalid and negative\npedestrian character information, which is harmful to trajectory representation\nand thus leads to performance degradation. To address this issue, we present a\ntwo-stream sparse-character-based network~(TSNet) for pedestrian trajectory\nprediction. Specifically, TSNet learns the negative-removed characters in the\nsparse character representation stream to improve the trajectory embedding\nobtained in the trajectory representation stream. Moreover, to model the\nnegative-removed characters, we propose a novel sparse character graph,\nincluding the sparse category and sparse temporal character graphs, to learn\nthe different effects of various characters in category and temporal\ndimensions, respectively. Extensive experiments on two first-person view\ndatasets, PIE and JAAD, show that our method outperforms existing\nstate-of-the-art methods. In addition, ablation studies demonstrate different\neffects of various characters and prove that TSNet outperforms approaches\nwithout eliminating negative characters.\n","authors":["Yonghao Dong","Le Wang","Sanpin Zhou","Gang Hua","Changyin Sun"],"pdf_url":"https://arxiv.org/pdf/2311.15512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15510v1","updated":"2023-11-27T03:09:58Z","published":"2023-11-27T03:09:58Z","title":"CaesarNeRF: Calibrated Semantic Representation for Few-shot\n Generalizable Neural Rendering","summary":" Generalizability and few-shot learning are key challenges in Neural Radiance\nFields (NeRF), often due to the lack of a holistic understanding in pixel-level\nrendering. We introduce CaesarNeRF, an end-to-end approach that leverages\nscene-level CAlibratEd SemAntic Representation along with pixel-level\nrepresentations to advance few-shot, generalizable neural rendering,\nfacilitating a holistic understanding without compromising high-quality\ndetails. CaesarNeRF explicitly models pose differences of reference views to\ncombine scene-level semantic representations, providing a calibrated holistic\nunderstanding. This calibration process aligns various viewpoints with precise\nlocation and is further enhanced by sequential refinement to capture varying\ndetails. 
Extensive experiments on public datasets, including LLFF, Shiny,\nmip-NeRF 360, and MVImgNet, show that CaesarNeRF delivers state-of-the-art\nperformance across varying numbers of reference views, proving effective even\nwith a single reference image. The project page of this work can be found at\nhttps://haidongz-usc.github.io/project/caesarnerf.\n","authors":["Haidong Zhu","Tianyu Ding","Tianyi Chen","Ilya Zharkov","Ram Nevatia","Luming Liang"],"pdf_url":"https://arxiv.org/pdf/2311.15510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11810v2","updated":"2023-11-27T02:53:33Z","published":"2023-11-20T14:42:25Z","title":"DocPedia: Unleashing the Power of Large Multimodal Model in the\n Frequency Domain for Versatile Document Understanding","summary":" This work presents DocPedia, a novel large multimodal model (LMM) for\nversatile OCR-free document understanding, capable of parsing images up to\n2,560$\\times$2,560 resolution. Unlike existing work either struggle with\nhigh-resolution documents or give up the large language model thus vision or\nlanguage ability constrained, our DocPedia directly processes visual input in\nthe frequency domain rather than the pixel space. The unique characteristic\nenables DocPedia to capture a greater amount of visual and textual information\nusing a limited number of visual tokens. To consistently enhance both\nperception and comprehension abilities of our model, we develop a dual-stage\ntraining strategy and enrich instructions/annotations of all training tasks\ncovering multiple document types. Extensive quantitative and qualitative\nexperiments conducted on various publicly available benchmarks confirm the\nmutual benefits of jointly learning perception and comprehension tasks. The\nresults provide further evidence of the effectiveness and superior performance\nof our DocPedia over other methods.\n","authors":["Hao Feng","Qi Liu","Hao Liu","Wengang Zhou","Houqiang Li","Can Huang"],"pdf_url":"https://arxiv.org/pdf/2311.11810v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15497v1","updated":"2023-11-27T02:48:06Z","published":"2023-11-27T02:48:06Z","title":"Adaptive Image Registration: A Hybrid Approach Integrating Deep Learning\n and Optimization Functions for Enhanced Precision","summary":" Image registration has traditionally been done using two distinct approaches:\nlearning based methods, relying on robust deep neural networks, and\noptimization-based methods, applying complex mathematical transformations to\nwarp images accordingly. Of course, both paradigms offer advantages and\ndisadvantages, and, in this work, we seek to combine their respective strengths\ninto a single streamlined framework, using the outputs of the learning based\nmethod as initial parameters for optimization while prioritizing computational\npower for the image pairs that offer the greatest loss. 
Our investigations\nshowed an improvement of 0.3\\% in testing when utilizing the best\nperforming state-of-the-art model as the backbone of the framework, while\nmaintaining the same inference time and incurring only a 0.8\\% loss in deformation\nfield smoothness.\n","authors":["Gabriel De Araujo","Shanlin Sun","Xiaohui Xie"],"pdf_url":"https://arxiv.org/pdf/2311.15497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13404v2","updated":"2023-11-27T02:33:36Z","published":"2023-11-22T14:00:23Z","title":"Animatable 3D Gaussians for High-fidelity Synthesis of Human Motions","summary":" We present a novel animatable 3D Gaussian model for rendering high-fidelity\nfree-view human motions in real time. Compared to existing NeRF-based methods,\nthe model owns better capability in synthesizing high-frequency details without\nthe jittering problem across video frames. The core of our model is a novel\naugmented 3D Gaussian representation, which attaches each Gaussian with a\nlearnable code. The learnable code serves as a pose-dependent appearance\nembedding for refining the erroneous appearance caused by geometric\ntransformation of Gaussians, based on which an appearance refinement model is\nlearned to produce residual Gaussian properties to match the appearance in\ntarget pose. To force the Gaussians to learn the foreground human only without\nbackground interference, we further design a novel alpha loss to explicitly\nconstrain the Gaussians within the human body. We also propose to jointly\noptimize the human joint parameters to improve the appearance accuracy. The\nanimatable 3D Gaussian model can be learned with shallow MLPs, so new human\nmotions can be synthesized in real time (66 fps on average). Experiments show\nthat our model has superior performance over NeRF-based methods.\n","authors":["Keyang Ye","Tianjia Shao","Kun Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.13404v2.pdf","comment":"Some experiment data is wrong. The expression of the paper in\n introduction and abstract is incorrect. Some graphs have inappropriate\n descriptions"},{"id":"http://arxiv.org/abs/2311.15478v1","updated":"2023-11-27T01:41:25Z","published":"2023-11-27T01:41:25Z","title":"AerialBooth: Mutual Information Guidance for Text Controlled Aerial View\n Synthesis from a Single Image","summary":" We present a novel method, AerialBooth, for synthesizing the aerial view from\na single input image using its text description. We leverage the pretrained\ntext-to-2D image stable diffusion model as prior knowledge of the 3D world. The\nmodel is finetuned in two steps to optimize for the text embedding and the UNet\nthat reconstruct the input image and its inverse perspective mapping\nrespectively. The inverse perspective mapping creates variance within the\ntext-image space of the diffusion model, while providing weak guidance for\naerial view synthesis. At inference, we steer the contents of the generated\nimage towards the input image using novel mutual information guidance that\nmaximizes the information content between the probability distributions of the\ntwo images. We evaluate our approach on a wide spectrum of real and synthetic\ndata, including natural scenes, indoor scenes, human action, etc. Through\nextensive experiments and ablation studies, we demonstrate the effectiveness of\nAerialBooth and also its generalizability to other text-controlled views. 
We\nalso show that AerialBooth achieves the best viewpoint-fidelity trade-off\nthrough quantitative evaluation on 7 metrics analyzing viewpoint and fidelity\nw.r.t. input image. Code and data are available at\nhttps://github.com/divyakraman/AerialBooth2023.\n","authors":["Divya Kothandaraman","Tianyi Zhou","Ming Lin","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2311.15478v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15477v1","updated":"2023-11-27T01:24:31Z","published":"2023-11-27T01:24:31Z","title":"DreamCreature: Crafting Photorealistic Virtual Creatures from\n Imagination","summary":" Recent text-to-image (T2I) generative models allow for high-quality synthesis\nfollowing either text instructions or visual examples. Despite their\ncapabilities, these models face limitations in creating new, detailed creatures\nwithin specific categories (e.g., virtual dog or bird species), which are\nvaluable in digital asset creation and biodiversity analysis. To bridge this\ngap, we introduce a novel task, Virtual Creatures Generation: Given a set of\nunlabeled images of the target concepts (e.g., 200 bird species), we aim to\ntrain a T2I model capable of creating new, hybrid concepts within diverse\nbackgrounds and contexts. We propose a new method called DreamCreature, which\nidentifies and extracts the underlying sub-concepts (e.g., body parts of a\nspecific species) in an unsupervised manner. The T2I thus adapts to generate\nnovel concepts (e.g., new bird species) with faithful structures and\nphotorealistic appearance by seamlessly and flexibly composing learned\nsub-concepts. To enhance sub-concept fidelity and disentanglement, we extend\nthe textual inversion technique by incorporating an additional projector and\ntailored attention loss regularization. Extensive experiments on two\nfine-grained image benchmarks demonstrate the superiority of DreamCreature over\nprior methods in both qualitative and quantitative evaluation. Ultimately, the\nlearned sub-concepts facilitate diverse creative applications, including\ninnovative consumer product designs and nuanced property modifications.\n","authors":["Kam Woh Ng","Xiatian Zhu","Yi-Zhe Song","Tao Xiang"],"pdf_url":"https://arxiv.org/pdf/2311.15477v1.pdf","comment":"Website: https://github.com/kamwoh/dreamcreature"},{"id":"http://arxiv.org/abs/2311.15475v1","updated":"2023-11-27T01:20:11Z","published":"2023-11-27T01:20:11Z","title":"MeshGPT: Generating Triangle Meshes with Decoder-Only Transformers","summary":" We introduce MeshGPT, a new approach for generating triangle meshes that\nreflects the compactness typical of artist-created meshes, in contrast to dense\ntriangle meshes extracted by iso-surfacing methods from neural fields. Inspired\nby recent advances in powerful large language models, we adopt a sequence-based\napproach to autoregressively generate triangle meshes as sequences of\ntriangles. We first learn a vocabulary of latent quantized embeddings, using\ngraph convolutions, which inform these embeddings of the local mesh geometry\nand topology. These embeddings are sequenced and decoded into triangles by a\ndecoder, ensuring that they can effectively reconstruct the mesh. A transformer\nis then trained on this learned vocabulary to predict the index of the next\nembedding given previous embeddings. 
Once trained, our model can be\nautoregressively sampled to generate new triangle meshes, directly generating\ncompact meshes with sharp edges, more closely imitating the efficient\ntriangulation patterns of human-crafted meshes. MeshGPT demonstrates a notable\nimprovement over state of the art mesh generation methods, with a 9% increase\nin shape coverage and a 30-point enhancement in FID scores across various\ncategories.\n","authors":["Yawar Siddiqui","Antonio Alliegro","Alexey Artemov","Tatiana Tommasi","Daniele Sirigatti","Vladislav Rosov","Angela Dai","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2311.15475v1.pdf","comment":"Project Page: https://nihalsid.github.io/mesh-gpt/, Video:\n https://youtu.be/UV90O1_69_o"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2311.16075v1","updated":"2023-11-27T18:46:17Z","published":"2023-11-27T18:46:17Z","title":"BioLORD-2023: Semantic Textual Representations Fusing LLM and Clinical\n Knowledge Graph Insights","summary":" In this study, we investigate the potential of Large Language Models to\ncomplement biomedical knowledge graphs in the training of semantic models for\nthe biomedical and clinical domains. Drawing on the wealth of the UMLS\nknowledge graph and harnessing cutting-edge Large Language Models, we propose a\nnew state-of-the-art approach for obtaining high-fidelity representations of\nbiomedical concepts and sentences, consisting of three steps: an improved\ncontrastive learning phase, a novel self-distillation phase, and a weight\naveraging phase. Through rigorous evaluations via the extensive BioLORD testing\nsuite and diverse downstream tasks, we demonstrate consistent and substantial\nperformance improvements over the previous state of the art (e.g. +2pts on\nMedSTS, +2.5pts on MedNLI-S, +6.1pts on EHR-Rel-B). Besides our new\nstate-of-the-art biomedical model for English, we also distill and release a\nmultilingual model compatible with 50+ languages and finetuned on 7 European\nlanguages. Many clinical pipelines can benefit from our latest models. Our new\nmultilingual model enables a range of languages to benefit from our\nadvancements in biomedical semantic representation learning, opening a new\navenue for bioinformatics researchers around the world. As a result, we hope to\nsee BioLORD-2023 becoming a precious tool for future biomedical applications.\n","authors":["François Remy","Kris Demuynck","Thomas Demeester"],"pdf_url":"https://arxiv.org/pdf/2311.16075v1.pdf","comment":"Preprint of upcoming journal article"},{"id":"http://arxiv.org/abs/2310.13540v3","updated":"2023-11-27T15:33:04Z","published":"2023-10-20T14:36:09Z","title":"Thoroughly Modeling Multi-domain Pre-trained Recommendation as Language","summary":" With the thriving of pre-trained language model (PLM) widely verified in\nvarious of NLP tasks, pioneer efforts attempt to explore the possible\ncooperation of the general textual information in PLM with the personalized\nbehavioral information in user historical behavior sequences to enhance\nsequential recommendation (SR). However, despite the commonalities of input\nformat and task goal, there are huge gaps between the behavioral and textual\ninformation, which obstruct thoroughly modeling SR as language modeling via\nPLM. To bridge the gap, we propose a novel Unified pre-trained language model\nenhanced sequential recommendation (UPSR), aiming to build a unified\npre-trained recommendation model for multi-domain recommendation tasks. 
We\nformally design five key indicators, namely naturalness, domain consistency,\ninformativeness, noise & ambiguity, and text length, to guide the text-item\nadaptation and behavior sequence-text sequence adaptation differently for\npre-training and fine-tuning stages, which are essential but under-explored by\nprevious works. In experiments, we conduct extensive evaluations on seven\ndatasets with both tuning and zero-shot settings and achieve the overall best\nperformance. Comprehensive model analyses also provide valuable insights for\nbehavior modeling via PLM, shedding light on large pre-trained recommendation\nmodels. The source codes will be released in the future.\n","authors":["Zekai Qu","Ruobing Xie","Chaojun Xiao","Yuan Yao","Zhiyuan Liu","Fengzong Lian","Zhanhui Kang","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.13540v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15923v1","updated":"2023-11-27T15:32:52Z","published":"2023-11-27T15:32:52Z","title":"SEINE: SEgment-based Indexing for NEural information retrieval","summary":" Many early neural Information Retrieval (NeurIR) methods are re-rankers that\nrely on a traditional first-stage retriever due to expensive query time\ncomputations. Recently, representation-based retrievers have gained much\nattention, which learns query representation and document representation\nseparately, making it possible to pre-compute document representations offline\nand reduce the workload at query time. Both dense and sparse\nrepresentation-based retrievers have been explored. However, these methods\nfocus on finding the representation that best represents a text (aka metric\nlearning) and the actual retrieval function that is responsible for similarity\nmatching between query and document is kept at a minimum by using dot product.\nOne drawback is that unlike traditional term-level inverted index, the index\nformed by these embeddings cannot be easily re-used by another retrieval\nmethod. Another drawback is that keeping the interaction at minimum hurts\nretrieval effectiveness. On the contrary, interaction-based retrievers are\nknown for their better retrieval effectiveness. In this paper, we propose a\nnovel SEgment-based Neural Indexing method, SEINE, which provides a general\nindexing framework that can flexibly support a variety of interaction-based\nneural retrieval methods. We emphasize on a careful decomposition of common\ncomponents in existing neural retrieval methods and propose to use\nsegment-level inverted index to store the atomic query-document interaction\nvalues. Experiments on LETOR MQ2007 and MQ2008 datasets show that our indexing\nmethod can accelerate multiple neural retrieval methods up to 28-times faster\nwithout sacrificing much effectiveness.\n","authors":["Sibo Dong","Justin Goldstein","Grace Hui Yang"],"pdf_url":"https://arxiv.org/pdf/2311.15923v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14084v2","updated":"2023-11-27T13:43:19Z","published":"2023-11-23T16:22:58Z","title":"AI-Generated Images Introduce Invisible Relevance Bias to Text-Image\n Retrieval","summary":" With the advancement of generation models, AI-generated content (AIGC) is\nbecoming more realistic, flooding the Internet. A recent study suggests that\nthis phenomenon has elevated the issue of source bias in text retrieval for web\nsearches. Specifically, neural retrieval models tend to rank generated texts\nhigher than human-written texts. In this paper, we extend the study of this\nbias to cross-modal retrieval. 
Firstly, we successfully construct a suitable\nbenchmark to explore the existence of the bias. Subsequent extensive\nexperiments on this benchmark reveal that AI-generated images introduce an\ninvisible relevance bias to text-image retrieval models. Specifically, our\nexperiments show that text-image retrieval models tend to rank the AI-generated\nimages higher than the real images, even though the AI-generated images do not\nexhibit more visually relevant features to the query than real images. This\ninvisible relevance bias is prevalent across retrieval models with varying\ntraining data and architectures. Furthermore, our subsequent exploration\nreveals that the inclusion of AI-generated images in the training data of the\nretrieval models exacerbates the invisible relevance bias. The above phenomenon\ntriggers a vicious cycle, which makes the invisible relevance bias become more\nand more serious. To elucidate the potential causes of invisible relevance and\naddress the aforementioned issues, we introduce an effective training method\naimed at alleviating the invisible relevance bias. Subsequently, we apply our\nproposed debiasing method to retroactively identify the causes of invisible\nrelevance, revealing that the AI-generated images induce the image encoder to\nembed additional information into their representation. This information\nexhibits a certain consistency across generated images with different semantics\nand can make the retriever estimate a higher relevance score.\n","authors":["Shicheng Xu","Danyang Hou","Liang Pang","Jingcheng Deng","Jun Xu","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2311.14084v2.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2311.15790v1","updated":"2023-11-27T13:04:33Z","published":"2023-11-27T13:04:33Z","title":"A Social-aware Gaussian Pre-trained Model for Effective Cold-start\n Recommendation","summary":" The use of pre-training is an emerging technique to enhance a neural model's\nperformance, which has been shown to be effective for many neural language\nmodels such as BERT. This technique has also been used to enhance the\nperformance of recommender systems. In such recommender systems, pre-training\nmodels are used to learn a better initialisation for both users and items.\nHowever, recent existing pre-trained recommender systems tend to only\nincorporate the user interaction data at the pre-training stage, making it\ndifficult to deliver good recommendations, especially when the interaction data\nis sparse. To alleviate this common data sparsity issue, we propose to\npre-train the recommendation model not only with the interaction data but also\nwith other available information such as the social relations among users,\nthereby providing the recommender system with a better initialisation compared\nwith solely relying on the user interaction data. We propose a novel\nrecommendation model, the Social-aware Gaussian Pre-trained model (SGP), which\nencodes the user social relations and interaction data at the pre-training\nstage in a Graph Neural Network (GNN). Afterwards, in the subsequent\nfine-tuning stage, our SGP model adopts a Gaussian Mixture Model (GMM) to\nfactorise these pre-trained embeddings for further training, thereby benefiting\nthe cold-start users from these pre-built social relations. Our extensive\nexperiments on three public datasets show that, in comparison to 16 competitive\nbaselines, our SGP model significantly outperforms the best baseline by upto\n7.7% in terms of NDCG@10. 
In addition, we show that SGP permits to effectively\nalleviate the cold-start problem, especially when users newly register to the\nsystem through their friends' suggestions.\n","authors":["Siwei Liu","Xi Wang","Craig Macdonald","Iadh Ounis"],"pdf_url":"https://arxiv.org/pdf/2311.15790v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2311.15716v1","updated":"2023-11-27T10:59:16Z","published":"2023-11-27T10:59:16Z","title":"Justifiable Artificial Intelligence: Engineering Large Language Models\n for Legal Applications","summary":" In this work, I discuss how Large Language Models can be applied in the legal\ndomain, circumventing their current drawbacks. Despite their large success and\nacceptance, their lack of explainability hinders legal experts to trust in\ntheir output, and this happens rightfully so. However, in this paper, I argue\nin favor of a new view, Justifiable Artificial Intelligence, instead of\nfocusing on Explainable Artificial Intelligence. I discuss in this paper how\ngaining evidence for and against a Large Language Model's output may make their\ngenerated texts more trustworthy - or hold them accountable for misinformation.\n","authors":["Sabine Wehnert"],"pdf_url":"https://arxiv.org/pdf/2311.15716v1.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2307.10230v3","updated":"2023-11-27T10:38:30Z","published":"2023-07-15T11:49:43Z","title":"Prompt Tuning on Graph-augmented Low-resource Text Classification","summary":" Text classification is a fundamental problem in information retrieval with\nmany real-world applications, such as predicting the topics of online articles\nand the categories of e-commerce product descriptions. However, low-resource\ntext classification, with no or few labeled samples, presents a serious concern\nfor supervised learning. Meanwhile, many text data are inherently grounded on a\nnetwork structure, such as a hyperlink/citation network for online articles,\nand a user-item purchase network for e-commerce products. These graph\nstructures capture rich semantic relationships, which can potentially augment\nlow-resource text classification. In this paper, we propose a novel model\ncalled Graph-Grounded Pre-training and Prompting (G2P2) to address low-resource\ntext classification in a two-pronged approach. During pre-training, we propose\nthree graph interaction-based contrastive strategies to jointly pre-train a\ngraph-text model; during downstream classification, we explore handcrafted\ndiscrete prompts and continuous prompt tuning for the jointly pre-trained model\nto achieve zero- and few-shot classification, respectively. Moreover, we\nexplore the possibility of employing continuous prompt tuning for zero-shot\ninference. Specifically, we aim to generalize continuous prompts to unseen\nclasses while leveraging a set of base classes. To this end, we extend G2P2\ninto G2P2$^*$, hinging on a new architecture of conditional prompt tuning.\nExtensive experiments on four real-world datasets demonstrate the strength of\nG2P2 in zero- and few-shot low-resource text classification tasks, and\nillustrate the advantage of G2P2$^*$ in dealing with unseen classes.\n","authors":["Zhihao Wen","Yuan Fang"],"pdf_url":"https://arxiv.org/pdf/2307.10230v3.pdf","comment":"14 pages, journal under review. 
arXiv admin note: substantial text\n overlap with arXiv:2305.03324"},{"id":"http://arxiv.org/abs/2311.15689v1","updated":"2023-11-27T10:28:06Z","published":"2023-11-27T10:28:06Z","title":"Two Approaches to the Identity of Processes in BFO","summary":" This paper aims to explore processes and their identity with a focus on the\nupper ontology Basic Formal Ontology (BFO). We begin with a classification\nbased on two basic classes of changes of independent continuants: changes with\nrespect to a single specifically dependent continuant thereof or with respect\nto the spatial region that its parts occupy. We accordingly distinguish two\nkinds of simple processes: specifically dependent continuant changes and\nspatial changes. Next, we investigate a compositional approach to the identity\nof processes: the identity of any process is determined by the identity of the\nsimple processes that compose them. Then, we consider a causal approach to the\nidentity of processes with recourse to a dispositional view of processes\naccording to which any process is a realization of some disposition. We also\nexamine assumptions on which these two approaches to the identity of processes\nare based.\n","authors":["Fumiaki Toyoshima","Adrien Barton"],"pdf_url":"https://arxiv.org/pdf/2311.15689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15578v1","updated":"2023-11-27T07:11:47Z","published":"2023-11-27T07:11:47Z","title":"Experimental Analysis of Large-scale Learnable Vector Storage\n Compression","summary":" Learnable embedding vector is one of the most important applications in\nmachine learning, and is widely used in various database-related domains.\nHowever, the high dimensionality of sparse data in recommendation tasks and the\nhuge volume of corpus in retrieval-related tasks lead to a large memory\nconsumption of the embedding table, which poses a great challenge to the\ntraining and deployment of models. Recent research has proposed various methods\nto compress the embeddings at the cost of a slight decrease in model quality or\nthe introduction of other overheads. Nevertheless, the relative performance of\nthese methods remains unclear. Existing experimental comparisons only cover a\nsubset of these methods and focus on limited metrics. In this paper, we perform\na comprehensive comparative analysis and experimental evaluation of embedding\ncompression. We introduce a new taxonomy that categorizes these techniques\nbased on their characteristics and methodologies, and further develop a modular\nbenchmarking framework that integrates 14 representative methods. Under a\nuniform test environment, our benchmark fairly evaluates each approach,\npresents their strengths and weaknesses under different memory budgets, and\nrecommends the best method based on the use case. In addition to providing\nuseful guidelines, our study also uncovers the limitations of current methods\nand suggests potential directions for future research.\n","authors":["Hailin Zhang","Penghao Zhao","Xupeng Miao","Yingxia Shao","Zirui Liu","Tong Yang","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2311.15578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15564v1","updated":"2023-11-27T06:22:57Z","published":"2023-11-27T06:22:57Z","title":"Boot and Switch: Alternating Distillation for Zero-Shot Dense Retrieval","summary":" Neural 'dense' retrieval models are state of the art for many datasets,\nhowever these models often exhibit limited domain transfer ability. 
Existing\napproaches to adaptation are unwieldy, such as requiring explicit supervision,\ncomplex model architectures, or massive external models. We present\n$\\texttt{ABEL}$, a simple but effective unsupervised method to enhance passage\nretrieval in zero-shot settings. Our technique follows a straightforward loop:\na dense retriever learns from supervision signals provided by a reranker, and\nsubsequently, the reranker is updated based on feedback from the improved\nretriever. By iterating this loop, the two components mutually enhance one\nanother's performance. Experimental results demonstrate that our unsupervised\n$\\texttt{ABEL}$ model outperforms both leading supervised and unsupervised\nretrievers on the BEIR benchmark. Meanwhile, it exhibits strong adaptation\nabilities to tasks and domains that were unseen during training. By either\nfine-tuning $\\texttt{ABEL}$ on labelled data or integrating it with existing\nsupervised dense retrievers, we achieve state-of-the-art\nresults.\\footnote{Source code is available at\n\\url{https://github.com/Fantabulous-J/BootSwitch}.}\n","authors":["Fan Jiang","Qiongkai Xu","Tom Drummond","Trevor Cohn"],"pdf_url":"https://arxiv.org/pdf/2311.15564v1.pdf","comment":"Accepted by EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2311.15563v1","updated":"2023-11-27T06:19:50Z","published":"2023-11-27T06:19:50Z","title":"Noisy Self-Training with Synthetic Queries for Dense Retrieval","summary":" Although existing neural retrieval models reveal promising results when\ntraining data is abundant and the performance keeps improving as training data\nincreases, collecting high-quality annotated data is prohibitively costly. To\nthis end, we introduce a novel noisy self-training framework combined with\nsynthetic queries, showing that neural retrievers can be improved in a\nself-evolution manner with no reliance on any external models. Experimental\nresults show that our method improves consistently over existing methods on\nboth general-domain (e.g., MS-MARCO) and out-of-domain (i.e., BEIR) retrieval\nbenchmarks. Extra analysis on low-resource settings reveals that our method is\ndata efficient and outperforms competitive baselines, with as little as 30% of\nlabelled training data. Further extending the framework for reranker training\ndemonstrates that the proposed method is general and yields additional gains on\ntasks of diverse domains.\\footnote{Source code is available at\n\\url{https://github.com/Fantabulous-J/Self-Training-DPR}}\n","authors":["Fan Jiang","Tom Drummond","Trevor Cohn"],"pdf_url":"https://arxiv.org/pdf/2311.15563v1.pdf","comment":"Accepted by EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2311.13534v2","updated":"2023-11-27T02:52:46Z","published":"2023-11-22T17:14:54Z","title":"LM-Cocktail: Resilient Tuning of Language Models via Model Merging","summary":" The pre-trained language models are continually fine-tuned to better support\ndownstream applications. However, this operation may result in significant\nperformance degeneration on general tasks beyond the targeted domain. To\novercome this problem, we propose a novel method which enables the fine-tuned\nmodel to stay resilient in general perspectives. Our method is conducted in the\nform of model merging (namely LM-Cocktail), where the fine-tuned language model\nis merged with the pre-trained base model or the peer models from other domains\nthrough weighted average. 
Despite simplicity, LM-Cocktail is surprisingly\neffective: the resulted model is able to achieve a strong empirical performance\nin the whole scope of general tasks while preserving a superior capacity in its\ntargeted domain. We conduct comprehensive experiments with LLama and BGE model\non popular benchmarks, including FLAN, MMLU, MTEB, whose results validate the\nefficacy of our proposed method. The code and checkpoints are available at\nhttps://github.com/FlagOpen/FlagEmbedding/tree/master/LM_Cocktail.\n","authors":["Shitao Xiao","Zheng Liu","Peitian Zhang","Xingrun Xing"],"pdf_url":"https://arxiv.org/pdf/2311.13534v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15493v1","updated":"2023-11-27T02:30:39Z","published":"2023-11-27T02:30:39Z","title":"UFIN: Universal Feature Interaction Network for Multi-Domain\n Click-Through Rate Prediction","summary":" Click-Through Rate (CTR) prediction, which aims to estimate the probability\nof a user clicking on an item, is a key task in online advertising. Numerous\nexisting CTR models concentrate on modeling the feature interactions within a\nsolitary domain, thereby rendering them inadequate for fulfilling the\nrequisites of multi-domain recommendations in real industrial scenarios. Some\nrecent approaches propose intricate architectures to enhance knowledge sharing\nand augment model training across multiple domains. However, these approaches\nencounter difficulties when being transferred to new recommendation domains,\nowing to their reliance on the modeling of ID features (e.g., item id). To\naddress the above issue, we propose the Universal Feature Interaction Network\n(UFIN) approach for CTR prediction. UFIN exploits textual data to learn\nuniversal feature interactions that can be effectively transferred across\ndiverse domains. For learning universal feature representations, we regard the\ntext and feature as two different modalities and propose an encoder-decoder\nnetwork founded on a Large Language Model (LLM) to enforce the transfer of data\nfrom the text modality to the feature modality. Building upon the above\nfoundation, we further develop a mixtureof-experts (MoE) enhanced adaptive\nfeature interaction model to learn transferable collaborative patterns across\nmultiple domains. Furthermore, we propose a multi-domain knowledge distillation\nframework to enhance feature interaction learning. Based on the above methods,\nUFIN can effectively bridge the semantic gap to learn common knowledge across\nvarious domains, surpassing the constraints of ID-based models. Extensive\nexperiments conducted on eight datasets show the effectiveness of UFIN, in both\nmultidomain and cross-platform settings. Our code is available at\nhttps://github.com/RUCAIBox/UFIN.\n","authors":["Zhen Tian","Changwang Zhang","Wayne Xin Zhao","Xin Zhao","Ji-Rong Wen","Zhao Cao"],"pdf_url":"https://arxiv.org/pdf/2311.15493v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16334v1","updated":"2023-11-27T21:38:10Z","published":"2023-11-27T21:38:10Z","title":"Robust Basket Recommendation via Noise-tolerated Graph Contrastive\n Learning","summary":" The growth of e-commerce has seen a surge in popularity of platforms like\nAmazon, eBay, and Taobao. This has given rise to a unique shopping behavior\ninvolving baskets - sets of items purchased together. As a less studied\ninteraction mode in the community, the question of how should shopping basket\ncomplement personalized recommendation systems remains under-explored. 
While\nprevious attempts focused on jointly modeling user purchases and baskets, the\ndistinct semantic nature of these elements can introduce noise when directly\nintegrated. This noise negatively impacts the model's performance, further\nexacerbated by significant noise within both user and basket behaviors.\n In order to cope with the above difficulties, we propose a novel Basket\nrecommendation framework via Noise-tolerated Contrastive Learning, named BNCL,\nto handle the noise existing in the cross-behavior integration and\nwithin-behavior modeling. First, we represent the basket-item interactions as\nthe hypergraph to model the complex basket behavior, where all items appearing\nin the same basket are treated as a single hyperedge. Second, cross-behavior\ncontrastive learning is designed to suppress the noise during the fusion of\ndiverse behaviors. Next, to further inhibit the within-behavior noise of the\nuser and basket interactions, we propose to exploit invariant properties of the\nrecommenders w.r.t augmentations through within-behavior contrastive learning.\nA novel consistency-aware augmentation approach is further designed to better\nidentify noisy interactions with the consideration of the above two types of\ninteractions. Our framework BNCL offers a generic training paradigm that is\napplicable to different backbones. Extensive experiments on three shopping\ntransaction datasets verify the effectiveness of our proposed method. Our code\nis available.\n","authors":["Xinrui He","Tianxin Wei","Jingrui He"],"pdf_url":"https://arxiv.org/pdf/2311.16334v1.pdf","comment":"CIKM 2023"},{"id":"http://arxiv.org/abs/2212.09744v2","updated":"2023-11-27T19:57:09Z","published":"2022-12-19T18:59:34Z","title":"DSI++: Updating Transformer Memory with New Documents","summary":" Differentiable Search Indices (DSIs) encode a corpus of documents in model\nparameters and use the same model to answer user queries directly. Despite the\nstrong performance of DSI models, deploying them in situations where the corpus\nchanges over time is computationally expensive because reindexing the corpus\nrequires re-training the model. In this work, we introduce DSI++, a continual\nlearning challenge for DSI to incrementally index new documents while being\nable to answer queries related to both previously and newly indexed documents.\nAcross different model scales and document identifier representations, we show\nthat continual indexing of new documents leads to considerable forgetting of\npreviously indexed documents. We also hypothesize and verify that the model\nexperiences forgetting events during training, leading to unstable learning. To\nmitigate these issues, we investigate two approaches. The first focuses on\nmodifying the training dynamics. Flatter minima implicitly alleviate\nforgetting, so we optimize for flatter loss basins and show that the model\nstably memorizes more documents ($+12\\%$). Next, we introduce a generative\nmemory to sample pseudo-queries for documents and supplement them during\ncontinual indexing to prevent forgetting for the retrieval task. Extensive\nexperiments on novel continual indexing benchmarks based on Natural Questions\n(NQ) and MS MARCO demonstrate that our proposed solution mitigates forgetting\nsignificantly. 
Concretely, it improves the average Hits@10 by $+21.1\\%$ over\ncompetitive baselines for NQ and requires $6$ times fewer model updates\ncompared to re-training the DSI model for incrementally indexing five corpora\nin a sequence.\n","authors":["Sanket Vaibhav Mehta","Jai Gupta","Yi Tay","Mostafa Dehghani","Vinh Q. Tran","Jinfeng Rao","Marc Najork","Emma Strubell","Donald Metzler"],"pdf_url":"https://arxiv.org/pdf/2212.09744v2.pdf","comment":"Accepted at EMNLP 2023 main conference"},{"id":"http://arxiv.org/abs/2311.16207v1","updated":"2023-11-27T15:34:14Z","published":"2023-11-27T15:34:14Z","title":"The Graph Convolutional Network with Multi-representation Alignment for\n Drug Synergy Prediction","summary":" Drug combination refers to the use of two or more drugs to treat a specific\ndisease at the same time. It is currently the mainstream way to treat complex\ndiseases. Compared with single drugs, drug combinations have better efficacy\nand can better inhibit toxicity and drug resistance. The computational model\nbased on deep learning concatenates the representation of multiple drugs and\nthe corresponding cell line feature as input, and the output is whether the\ndrug combination can have an inhibitory effect on the cell line. However, this\nstrategy of concatenating multiple representations has the following defects:\nthe alignment of drug representation and cell line representation is ignored,\nresulting in the synergistic relationship not being reflected positionally in\nthe embedding space. Moreover, the alignment measurement function in deep\nlearning cannot be suitable for drug synergy prediction tasks due to\ndifferences in input types. Therefore, in this work, we propose a graph\nconvolutional network with multi-representation alignment (GCNMRA) for\npredicting drug synergy. In the GCNMRA model, we designed a\nmulti-representation alignment function suitable for the drug synergy\nprediction task so that the positional relationship between drug\nrepresentations and cell line representation is reflected in the embedding\nspace. In addition, the vector modulus of drug representations and cell line\nrepresentation is considered to improve the accuracy of calculation results and\naccelerate model convergence. Finally, many relevant experiments were run on\nmultiple drug synergy datasets to verify the effectiveness of the above\ninnovative elements and the excellence of the GCNMRA model.\n","authors":["Xinxing Yang","Genke Yang","Jian Chu"],"pdf_url":"https://arxiv.org/pdf/2311.16207v1.pdf","comment":"14 pages;"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2311.16102v1","updated":"2023-11-27T18:59:53Z","published":"2023-11-27T18:59:53Z","title":"Test-time Adaptation of Discriminative Models via Diffusion Generative\n Feedback","summary":" The advancements in generative modeling, particularly the advent of diffusion\nmodels, have sparked a fundamental question: how can these models be\neffectively used for discriminative tasks? In this work, we find that\ngenerative models can be great test-time adapters for discriminative models.\nOur method, Diffusion-TTA, adapts pre-trained discriminative models such as\nimage classifiers, segmenters and depth predictors, to each unlabelled example\nin the test set using generative feedback from a diffusion model. We achieve\nthis by modulating the conditioning of the diffusion model using the output of\nthe discriminative model. 
We then maximize the image likelihood objective by\nbackpropagating the gradients to discriminative model's parameters. We show\nDiffusion-TTA significantly enhances the accuracy of various large-scale\npre-trained discriminative models, such as, ImageNet classifiers, CLIP models,\nimage pixel labellers and image depth predictors. Diffusion-TTA outperforms\nexisting test-time adaptation methods, including TTT-MAE and TENT, and\nparticularly shines in online adaptation setups, where the discriminative model\nis continually adapted to each example in the test set. We provide access to\ncode, results, and visualizations on our website:\nhttps://diffusion-tta.github.io/.\n","authors":["Mihir Prabhudesai","Tsung-Wei Ke","Alexander C. Li","Deepak Pathak","Katerina Fragkiadaki"],"pdf_url":"https://arxiv.org/pdf/2311.16102v1.pdf","comment":"Accepted at NeurIPS 2023 Webpage with Code:\n https://diffusion-tta.github.io/"},{"id":"http://arxiv.org/abs/2311.16101v1","updated":"2023-11-27T18:59:42Z","published":"2023-11-27T18:59:42Z","title":"How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for\n Vision LLMs","summary":" This work focuses on the potential of Vision LLMs (VLLMs) in visual\nreasoning. Different from prior studies, we shift our focus from evaluating\nstandard performance to introducing a comprehensive safety evaluation suite,\ncovering both out-of-distribution (OOD) generalization and adversarial\nrobustness. For the OOD evaluation, we present two novel VQA datasets, each\nwith one variant, designed to test model performance under challenging\nconditions. In exploring adversarial robustness, we propose a straightforward\nattack strategy for misleading VLLMs to produce visual-unrelated responses.\nMoreover, we assess the efficacy of two jailbreaking strategies, targeting\neither the vision or language component of VLLMs. Our evaluation of 21 diverse\nmodels, ranging from open-source VLLMs to GPT-4V, yields interesting\nobservations: 1) Current VLLMs struggle with OOD texts but not images, unless\nthe visual information is limited; and 2) These VLLMs can be easily misled by\ndeceiving vision encoders only, and their vision-language training often\ncompromise safety protocols. We release this safety evaluation suite at\nhttps://github.com/UCSC-VLAA/vllm-safety-benchmark.\n","authors":["Haoqin Tu","Chenhang Cui","Zijun Wang","Yiyang Zhou","Bingchen Zhao","Junlin Han","Wangchunshu Zhou","Huaxiu Yao","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2311.16101v1.pdf","comment":"H.T., C.C., and Z.W. contribute equally. Work done during H.T. and\n Z.W.'s internship at UCSC, and C.C. and Y.Z.'s internship at UNC"},{"id":"http://arxiv.org/abs/2311.16098v1","updated":"2023-11-27T18:59:25Z","published":"2023-11-27T18:59:25Z","title":"On Bringing Robots Home","summary":" Throughout history, we have successfully integrated various machines into our\nhomes. Dishwashers, laundry machines, stand mixers, and robot vacuums are a few\nrecent examples. However, these machines excel at performing only a single task\neffectively. The concept of a \"generalist machine\" in homes - a domestic\nassistant that can adapt and learn from our needs, all while remaining\ncost-effective - has long been a goal in robotics that has been steadily\npursued for decades. In this work, we initiate a large-scale effort towards\nthis goal by introducing Dobb-E, an affordable yet versatile general-purpose\nsystem for learning robotic manipulation within household settings. 
Dobb-E can\nlearn a new task with only five minutes of a user showing it how to do it,\nthanks to a demonstration collection tool (\"The Stick\") we built out of cheap\nparts and iPhones. We use the Stick to collect 13 hours of data in 22 homes of\nNew York City, and train Home Pretrained Representations (HPR). Then, in a\nnovel home environment, with five minutes of demonstrations and fifteen minutes\nof adapting the HPR model, we show that Dobb-E can reliably solve the task on\nthe Stretch, a mobile robot readily available on the market. Across roughly 30\ndays of experimentation in homes of New York City and surrounding areas, we\ntest our system in 10 homes, with a total of 109 tasks in different\nenvironments, and finally achieve a success rate of 81%. Beyond success\npercentages, our experiments reveal a plethora of unique challenges absent or\nignored in lab robotics. These range from effects of strong shadows, to\nvariable demonstration quality by non-expert users. With the hope of\naccelerating research on home robots, and eventually seeing robot butlers in\nevery home, we open-source Dobb-E software stack and models, our data, and our\nhardware designs at https://dobb-e.com\n","authors":["Nur Muhammad Mahi Shafiullah","Anant Rai","Haritheja Etukuru","Yiqian Liu","Ishan Misra","Soumith Chintala","Lerrel Pinto"],"pdf_url":"https://arxiv.org/pdf/2311.16098v1.pdf","comment":"Project website and videos are available at https://dobb-e.com,\n technical documentation for getting started is available at\n https://docs.dobb-e.com, and code is released at\n https://github.com/notmahi/dobb-e"},{"id":"http://arxiv.org/abs/2311.16093v1","updated":"2023-11-27T18:58:34Z","published":"2023-11-27T18:58:34Z","title":"Have we built machines that think like people?","summary":" A chief goal of artificial intelligence is to build machines that think like\npeople. Yet it has been argued that deep neural network architectures fail to\naccomplish this. Researchers have asserted these models' limitations in the\ndomains of causal reasoning, intuitive physics, and intuitive psychology. Yet\nrecent advancements, namely the rise of large language models, particularly\nthose designed for visual processing, have rekindled interest in the potential\nto emulate human-like cognitive abilities. This paper evaluates the current\nstate of vision-based large language models in the domains of intuitive\nphysics, causal reasoning, and intuitive psychology. Through a series of\ncontrolled experiments, we investigate the extent to which these modern models\ngrasp complex physical interactions, causal relationships, and intuitive\nunderstanding of others' preferences. Our findings reveal that, while these\nmodels demonstrate a notable proficiency in processing and interpreting visual\ndata, they still fall short of human capabilities in these areas. The models\nexhibit a rudimentary understanding of physical laws and causal relationships,\nbut their performance is hindered by a lack of deeper insights-a key aspect of\nhuman cognition. Furthermore, in tasks requiring an intuitive theory of mind,\nthe models fail altogether. Our results emphasize the need for integrating more\nrobust mechanisms for understanding causality, physical dynamics, and social\ncognition into modern-day, vision-based language models, and point out the\nimportance of cognitively-inspired benchmarks.\n","authors":["Luca M. 
Schulze Buschoff","Elif Akata","Matthias Bethge","Eric Schulz"],"pdf_url":"https://arxiv.org/pdf/2311.16093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16091v1","updated":"2023-11-27T18:57:42Z","published":"2023-11-27T18:57:42Z","title":"Interactive Autonomous Navigation with Internal State Inference and\n Interactivity Estimation","summary":" Deep reinforcement learning (DRL) provides a promising way for intelligent\nagents (e.g., autonomous vehicles) to learn to navigate complex scenarios.\nHowever, DRL with neural networks as function approximators is typically\nconsidered a black box with little explainability and often suffers from\nsuboptimal performance, especially for autonomous navigation in highly\ninteractive multi-agent environments. To address these issues, we propose three\nauxiliary tasks with spatio-temporal relational reasoning and integrate them\ninto the standard DRL framework, which improves the decision making performance\nand provides explainable intermediate indicators. We propose to explicitly\ninfer the internal states (i.e., traits and intentions) of surrounding agents\n(e.g., human drivers) as well as to predict their future trajectories in the\nsituations with and without the ego agent through counterfactual reasoning.\nThese auxiliary tasks provide additional supervision signals to infer the\nbehavior patterns of other interactive agents. Multiple variants of framework\nintegration strategies are compared. We also employ a spatio-temporal graph\nneural network to encode relations between dynamic entities, which enhances\nboth internal state inference and decision making of the ego agent. Moreover,\nwe propose an interactivity estimation mechanism based on the difference\nbetween predicted trajectories in these two situations, which indicates the\ndegree of influence of the ego agent on other agents. To validate the proposed\nmethod, we design an intersection driving simulator based on the Intelligent\nIntersection Driver Model (IIDM) that simulates vehicles and pedestrians. Our\napproach achieves robust and state-of-the-art performance in terms of standard\nevaluation metrics and provides explainable intermediate indicators (i.e.,\ninternal states, and interactivity scores) for decision making.\n","authors":["Jiachen Li","David Isele","Kanghoon Lee","Jinkyoo Park","Kikuo Fujimura","Mykel J. Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2311.16091v1.pdf","comment":"18 pages, 14 figures"},{"id":"http://arxiv.org/abs/2311.16086v1","updated":"2023-11-27T18:56:03Z","published":"2023-11-27T18:56:03Z","title":"MAST: Model-Agnostic Sparsified Training","summary":" We introduce a novel optimization problem formulation that departs from the\nconventional way of minimizing machine learning model loss as a black-box\nfunction. Unlike traditional formulations, the proposed approach explicitly\nincorporates an initially pre-trained model and random sketch operators,\nallowing for sparsification of both the model and gradient during training. We\nestablish insightful properties of the proposed objective function and\nhighlight its connections to the standard formulation. Furthermore, we present\nseveral variants of the Stochastic Gradient Descent (SGD) method adapted to the\nnew problem formulation, including SGD with general sampling, a distributed\nversion, and SGD with variance reduction techniques. 
We achieve tighter\nconvergence rates and relax assumptions, bridging the gap between theoretical\nprinciples and practical applications, covering several important techniques\nsuch as Dropout and Sparse training. This work presents promising opportunities\nto enhance the theoretical understanding of model training through a\nsparsification-aware optimization approach.\n","authors":["Yury Demidovich","Grigory Malinovsky","Egor Shulgin","Peter Richtárik"],"pdf_url":"https://arxiv.org/pdf/2311.16086v1.pdf","comment":"58 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.16082v1","updated":"2023-11-27T18:52:25Z","published":"2023-11-27T18:52:25Z","title":"Transformer-QEC: Quantum Error Correction Code Decoding with\n Transferable Transformers","summary":" Quantum computing has the potential to solve problems that are intractable\nfor classical systems, yet the high error rates in contemporary quantum devices\noften exceed tolerable limits for useful algorithm execution. Quantum Error\nCorrection (QEC) mitigates this by employing redundancy, distributing quantum\ninformation across multiple data qubits and utilizing syndrome qubits to\nmonitor their states for errors. The syndromes are subsequently interpreted by\na decoding algorithm to identify and correct errors in the data qubits. This\ntask is complex due to the multiplicity of error sources affecting both data\nand syndrome qubits as well as syndrome extraction operations. Additionally,\nidentical syndromes can emanate from different error sources, necessitating a\ndecoding algorithm that evaluates syndromes collectively. Although machine\nlearning (ML) decoders such as multi-layer perceptrons (MLPs) and convolutional\nneural networks (CNNs) have been proposed, they often focus on local syndrome\nregions and require retraining when adjusting for different code distances. We\nintroduce a transformer-based QEC decoder which employs self-attention to\nachieve a global receptive field across all input syndromes. It incorporates a\nmixed loss training approach, combining both local physical error and global\nparity label losses. Moreover, the transformer architecture's inherent\nadaptability to variable-length inputs allows for efficient transfer learning,\nenabling the decoder to adapt to varying code distances without retraining.\n Evaluation on six code distances and ten different error configurations\ndemonstrates that our model consistently outperforms non-ML decoders, such as\nUnion Find (UF) and Minimum Weight Perfect Matching (MWPM), and other ML\ndecoders, thereby achieving best logical error rates. Moreover, the transfer\nlearning can save over 10x of training cost.\n","authors":["Hanrui Wang","Pengyu Liu","Kevin Shao","Dantong Li","Jiaqi Gu","David Z. Pan","Yongshan Ding","Song Han"],"pdf_url":"https://arxiv.org/pdf/2311.16082v1.pdf","comment":"Accepted to ICCAD 2023, FAST ML for Science Workshop; 7 pages, 8\n figures"},{"id":"http://arxiv.org/abs/2311.16080v1","updated":"2023-11-27T18:50:37Z","published":"2023-11-27T18:50:37Z","title":"XLB: Distributed Multi-GPU Lattice Boltzmann Simulation Framework for\n Differentiable Scientific Machine Learning","summary":" The lattice Boltzmann method (LBM) has emerged as a prominent technique for\nsolving fluid dynamics problems due to its algorithmic potential for\ncomputational scalability. We introduce XLB framework, a Python-based\ndifferentiable LBM library which harnesses the capabilities of the JAX\nframework. 
The architecture of XLB is predicated upon ensuring accessibility,\nextensibility, and computational performance, enabling scaling effectively\nacross CPU, multi-GPU, and distributed multi-GPU systems. The framework can be\nreadily augmented with novel boundary conditions, collision models, or\nsimulation capabilities. XLB offers the unique advantage of integration with\nJAX's extensive machine learning echosystem, and the ability to utilize\nautomatic differentiation for tackling physics-based machine learning,\noptimization, and inverse problems. XLB has been successfully scaled to handle\nsimulations with billions of cells, achieving giga-scale lattice updates per\nsecond. XLB is released under the permissive Apache-2.0 license and is\navailable on GitHub at https://github.com/Autodesk/XLB.\n","authors":["Mohammadmehdi Ataei","Hesam Salehipour"],"pdf_url":"https://arxiv.org/pdf/2311.16080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16079v1","updated":"2023-11-27T18:49:43Z","published":"2023-11-27T18:49:43Z","title":"MEDITRON-70B: Scaling Medical Pretraining for Large Language Models","summary":" Large language models (LLMs) can potentially democratize access to medical\nknowledge. While many efforts have been made to harness and improve LLMs'\nmedical knowledge and reasoning capacities, the resulting models are either\nclosed-source (e.g., PaLM, GPT-4) or limited in scale (<= 13B parameters),\nwhich restricts their abilities. In this work, we improve access to large-scale\nmedical LLMs by releasing MEDITRON: a suite of open-source LLMs with 7B and 70B\nparameters adapted to the medical domain. MEDITRON builds on Llama-2 (through\nour adaptation of Nvidia's Megatron-LM distributed trainer), and extends\npretraining on a comprehensively curated medical corpus, including selected\nPubMed articles, abstracts, and internationally-recognized medical guidelines.\nEvaluations using four major medical benchmarks show significant performance\ngains over several state-of-the-art baselines before and after task-specific\nfinetuning. Overall, MEDITRON achieves a 6% absolute performance gain over the\nbest public baseline in its parameter class and 3% over the strongest baseline\nwe finetuned from Llama-2. Compared to closed-source LLMs, MEDITRON-70B\noutperforms GPT-3.5 and Med-PaLM and is within 5% of GPT-4 and 10% of\nMed-PaLM-2. We release our code for curating the medical pretraining corpus and\nthe MEDITRON model weights to drive open-source development of more capable\nmedical LLMs.\n","authors":["Zeming Chen","Alejandro Hernández Cano","Angelika Romanou","Antoine Bonnet","Kyle Matoba","Francesco Salvi","Matteo Pagliardini","Simin Fan","Andreas Köpf","Amirkeivan Mohtashami","Alexandre Sallinen","Alireza Sakhaeirad","Vinitra Swamy","Igor Krawczuk","Deniz Bayazit","Axel Marmet","Syrielle Montariol","Mary-Anne Hartley","Martin Jaggi","Antoine Bosselut"],"pdf_url":"https://arxiv.org/pdf/2311.16079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14309v2","updated":"2023-11-27T18:48:33Z","published":"2022-11-25T18:59:53Z","title":"FutureHuman3D: Forecasting Complex Long-Term 3D Human Behavior from\n Video Observations","summary":" We present a generative approach to forecast long-term future human behavior\nin 3D, requiring only weak supervision from readily available 2D human action\ndata. This is a fundamental task enabling many downstream applications. 
The\nrequired ground-truth data is hard to capture in 3D (mocap suits, expensive\nsetups) but easy to acquire in 2D (simple RGB cameras). Thus, we design our\nmethod to only require 2D RGB data while being able to generate 3D human motion\nsequences. We use a differentiable 2D projection scheme in an autoregressive\nmanner for weak supervision, and an adversarial loss for 3D regularization. Our\nmethod predicts long and complex behavior sequences (e.g. cooking, assembly)\nconsisting of multiple sub-actions. We tackle this in a semantically\nhierarchical manner, jointly predicting high-level coarse action labels\ntogether with their low-level fine-grained realizations as characteristic 3D\nhuman poses. We observe that these two action representations are coupled in\nnature, and joint prediction benefits both action and pose forecasting. Our\nexperiments demonstrate the complementary nature of joint action and 3D pose\nprediction: our joint approach outperforms each task treated individually,\nenables robust longer-term sequence prediction, and outperforms alternative\napproaches to forecast actions and characteristic 3D poses.\n","authors":["Christian Diller","Thomas Funkhouser","Angela Dai"],"pdf_url":"https://arxiv.org/pdf/2211.14309v2.pdf","comment":"Project Page: https://future-human-3d.christian-diller.de/ Video:\n https://www.youtube.com/watch?v=18du85YFXL0"},{"id":"http://arxiv.org/abs/2311.16065v1","updated":"2023-11-27T18:32:08Z","published":"2023-11-27T18:32:08Z","title":"A Survey on Vulnerability of Federated Learning: A Learning Algorithm\n Perspective","summary":" This review paper takes a comprehensive look at malicious attacks against FL,\ncategorizing them from new perspectives on attack origins and targets, and\nproviding insights into their methodology and impact. In this survey, we focus\non threat models targeting the learning process of FL systems. Based on the\nsource and target of the attack, we categorize existing threat models into four\ntypes, Data to Model (D2M), Model to Data (M2D), Model to Model (M2M) and\ncomposite attacks. For each attack type, we discuss the defense strategies\nproposed, highlighting their effectiveness, assumptions and potential areas for\nimprovement. Defense strategies have evolved from using a singular metric to\nexcluding malicious clients, to employing a multifaceted approach examining\nclient models at various phases. In this survey paper, our research indicates\nthat the to-learn data, the learning gradients, and the learned model at\ndifferent stages all can be manipulated to initiate malicious attacks that\nrange from undermining model performance, reconstructing private local data,\nand to inserting backdoors. We have also seen these threat are becoming more\ninsidious. While earlier studies typically amplified malicious gradients,\nrecent endeavors subtly alter the least significant weights in local models to\nbypass defense measures. 
This literature review provides a holistic\nunderstanding of the current FL threat landscape and highlights the importance\nof developing robust, efficient, and privacy-preserving defenses to ensure the\nsafe and trusted adoption of FL in real-world applications.\n","authors":["Xianghua Xie","Chen Hu","Hanchi Ren","Jingjing Deng"],"pdf_url":"https://arxiv.org/pdf/2311.16065v1.pdf","comment":"https://github.com/Rand2AI/Awesome-Vulnerability-of-Federated-Learning"},{"id":"http://arxiv.org/abs/2311.14078v2","updated":"2023-11-27T18:31:15Z","published":"2023-11-23T16:12:00Z","title":"Machine learning-based decentralized TDMA for VLC IoT networks","summary":" In this paper, a machine learning-based decentralized time division multiple\naccess (TDMA) algorithm for visible light communication (VLC) Internet of\nThings (IoT) networks is proposed. The proposed algorithm is based on\nQ-learning, a reinforcement learning algorithm. This paper considers a\ndecentralized condition in which there is no coordinator node for sending\nsynchronization frames and assigning transmission time slots to other nodes.\nThe proposed algorithm uses a decentralized manner for synchronization, and\neach node uses the Q-learning algorithm to find the optimal transmission time\nslot for sending data without collisions. The proposed algorithm is implemented\non a VLC hardware system, which had been designed and implemented in our\nlaboratory. Average reward, convergence time, goodput, average delay, and data\npacket size are evaluated parameters. The results show that the proposed\nalgorithm converges quickly and provides collision-free decentralized TDMA for\nthe network. The proposed algorithm is compared with carrier-sense multiple\naccess with collision avoidance (CSMA/CA) algorithm as a potential selection\nfor decentralized VLC IoT networks. The results show that the proposed\nalgorithm provides up to 61% more goodput and up to 49% less average delay than\nCSMA/CA.\n","authors":["Armin Makvandi","Yousef Seifi Kavian"],"pdf_url":"https://arxiv.org/pdf/2311.14078v2.pdf","comment":"This work has been submitted to a journal for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2210.06462v3","updated":"2023-11-27T18:30:14Z","published":"2022-10-12T17:57:58Z","title":"Self-Guided Diffusion Models","summary":" Diffusion models have demonstrated remarkable progress in image generation\nquality, especially when guidance is used to control the generative process.\nHowever, guidance requires a large amount of image-annotation pairs for\ntraining and is thus dependent on their availability, correctness and\nunbiasedness. In this paper, we eliminate the need for such annotation by\ninstead leveraging the flexibility of self-supervision signals to design a\nframework for self-guided diffusion models. By leveraging a feature extraction\nfunction and a self-annotation function, our method provides guidance signals\nat various image granularities: from the level of holistic images to object\nboxes and even segmentation masks. Our experiments on single-label and\nmulti-label image datasets demonstrate that self-labeled guidance always\noutperforms diffusion models without guidance and may even surpass guidance\nbased on ground-truth labels, especially on unbalanced data. 
When equipped with\nself-supervised box or mask proposals, our method further generates visually\ndiverse yet semantically consistent images, without the need for any class,\nbox, or segment label annotation. Self-guided diffusion is simple, flexible and\nexpected to profit from deployment at scale. Source code will be at:\nhttps://taohu.me/sgdm/\n","authors":["Vincent Tao Hu","David W Zhang","Yuki M. Asano","Gertjan J. Burghouts","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2210.06462v3.pdf","comment":"CVPR 2023"},{"id":"http://arxiv.org/abs/2112.12589v3","updated":"2023-11-27T18:29:31Z","published":"2021-12-20T13:46:39Z","title":"A deep reinforcement learning model for predictive maintenance planning\n of road assets: Integrating LCA and LCCA","summary":" Road maintenance planning is an integral part of road asset management. One\nof the main challenges in Maintenance and Rehabilitation (M&R) practices is to\ndetermine maintenance type and timing. This research proposes a framework using\nReinforcement Learning (RL) based on the Long Term Pavement Performance (LTPP)\ndatabase to determine the type and timing of M&R practices. A predictive DNN\nmodel is first developed in the proposed algorithm, which serves as the\nEnvironment for the RL algorithm. For the Policy estimation of the RL model,\nboth DQN and PPO models are developed. However, PPO has been selected in the\nend due to better convergence and higher sample efficiency. Indicators used in\nthis study are International Roughness Index (IRI) and Rutting Depth (RD).\nInitially, we considered Cracking Metric (CM) as the third indicator, but it\nwas then excluded due to the much fewer data compared to other indicators,\nwhich resulted in lower accuracy of the results. Furthermore, in\ncost-effectiveness calculation (reward), we considered both the economic and\nenvironmental impacts of M&R treatments. Costs and environmental impacts have\nbeen evaluated with paLATE 2.0 software. Our method is tested on a hypothetical\ncase study of a six-lane highway with 23 kilometers length located in Texas,\nwhich has a warm and wet climate. The results propose a 20-year M&R plan in\nwhich road condition remains in an excellent condition range. Because the early\nstate of the road is at a good level of service, there is no need for heavy\nmaintenance practices in the first years. Later, after heavy M&R actions, there\nare several 1-2 years of no need for treatments. All of these show that the\nproposed plan has a logical result. Decision-makers and transportation agencies\ncan use this scheme to conduct better maintenance practices that can prevent\nbudget waste and, at the same time, minimize the environmental impacts.\n","authors":["Moein Latifi","Fateme Golivand Darvishvand","Omid Khandel","Mobin Latifi Nowsoud"],"pdf_url":"https://arxiv.org/pdf/2112.12589v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.08805v3","updated":"2023-11-27T18:24:41Z","published":"2021-11-16T22:16:03Z","title":"Online Estimation and Optimization of Utility-Based Shortfall Risk","summary":" Utility-Based Shortfall Risk (UBSR) is a risk metric that is increasingly\npopular in financial applications, owing to certain desirable properties that\nit enjoys. We consider the problem of estimating UBSR in a recursive setting,\nwhere samples from the underlying loss distribution are available\none-at-a-time. We cast the UBSR estimation problem as a root finding problem,\nand propose stochastic approximation-based estimations schemes. 
We derive\nnon-asymptotic bounds on the estimation error in the number of samples. We also\nconsider the problem of UBSR optimization within a parameterized class of\nrandom variables. We propose a stochastic gradient descent based algorithm for\nUBSR optimization, and derive non-asymptotic bounds on its convergence.\n","authors":["Vishwajit Hegde","Arvind S. Menon","L. A. Prashanth","Krishna Jagannathan"],"pdf_url":"https://arxiv.org/pdf/2111.08805v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16054v1","updated":"2023-11-27T18:19:07Z","published":"2023-11-27T18:19:07Z","title":"Metric Space Magnitude for Evaluating Unsupervised Representation\n Learning","summary":" The magnitude of a metric space was recently established as a novel\ninvariant, providing a measure of the `effective size' of a space across\nmultiple scales. By capturing both geometrical and topological properties of\ndata, magnitude is poised to address challenges in unsupervised representation\nlearning tasks. We formalise a novel notion of dissimilarity between magnitude\nfunctions of finite metric spaces and use them to derive a quality measure for\ndimensionality reduction tasks. Our measure is provably stable under\nperturbations of the data, can be efficiently calculated, and enables a\nrigorous multi-scale comparison of embeddings. We show the utility of our\nmeasure in an experimental suite that comprises different domains and tasks,\nincluding the comparison of data visualisations.\n","authors":["Katharina Limbeck","Rayna Andreeva","Rik Sarkar","Bastian Rieck"],"pdf_url":"https://arxiv.org/pdf/2311.16054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16038v1","updated":"2023-11-27T17:59:41Z","published":"2023-11-27T17:59:41Z","title":"OccWorld: Learning a 3D Occupancy World Model for Autonomous Driving","summary":" Understanding how the 3D scene evolves is vital for making decisions in\nautonomous driving. Most existing methods achieve this by predicting the\nmovements of object boxes, which cannot capture more fine-grained scene\ninformation. In this paper, we explore a new framework of learning a world\nmodel, OccWorld, in the 3D Occupancy space to simultaneously predict the\nmovement of the ego car and the evolution of the surrounding scenes. We propose\nto learn a world model based on 3D occupancy rather than 3D bounding boxes and\nsegmentation maps for three reasons: 1) expressiveness. 3D occupancy can\ndescribe the more fine-grained 3D structure of the scene; 2) efficiency. 3D\noccupancy is more economical to obtain (e.g., from sparse LiDAR points). 3)\nversatility. 3D occupancy can adapt to both vision and LiDAR. To facilitate the\nmodeling of the world evolution, we learn a reconstruction-based scene\ntokenizer on the 3D occupancy to obtain discrete scene tokens to describe the\nsurrounding scenes. We then adopt a GPT-like spatial-temporal generative\ntransformer to generate subsequent scene and ego tokens to decode the future\noccupancy and ego trajectory. Extensive experiments on the widely used nuScenes\nbenchmark demonstrate the ability of OccWorld to effectively model the\nevolution of the driving scenes. OccWorld also produces competitive planning\nresults without using instance and map supervision. 
Code:\nhttps://github.com/wzzheng/OccWorld.\n","authors":["Wenzhao Zheng","Weiliang Chen","Yuanhui Huang","Borui Zhang","Yueqi Duan","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2311.16038v1.pdf","comment":"Code is available at: https://github.com/wzzheng/OccWorld"},{"id":"http://arxiv.org/abs/2308.00709v2","updated":"2023-11-27T17:57:18Z","published":"2023-07-28T22:52:15Z","title":"DeepTSF: Codeless machine learning operations for time series\n forecasting","summary":" This paper presents DeepTSF, a comprehensive machine learning operations\n(MLOps) framework aiming to innovate time series forecasting through workflow\nautomation and codeless modeling. DeepTSF automates key aspects of the ML\nlifecycle, making it an ideal tool for data scientists and MLops engineers\nengaged in machine learning (ML) and deep learning (DL)-based forecasting.\nDeepTSF empowers users with a robust and user-friendly solution, while it is\ndesigned to seamlessly integrate with existing data analysis workflows,\nproviding enhanced productivity and compatibility. The framework offers a\nfront-end user interface (UI) suitable for data scientists, as well as other\nhigher-level stakeholders, enabling comprehensive understanding through\ninsightful visualizations and evaluation metrics. DeepTSF also prioritizes\nsecurity through identity management and access authorization mechanisms. The\napplication of DeepTSF in real-life use cases of the I-NERGY project has\nalready proven DeepTSF's efficacy in DL-based load forecasting, showcasing its\nsignificant added value in the electrical power and energy systems domain.\n","authors":["Sotiris Pelekis","Evangelos Karakolis","Theodosios Pountridis","George Kormpakis","George Lampropoulos","Spiros Mouzakitis","Dimitris Askounis"],"pdf_url":"https://arxiv.org/pdf/2308.00709v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16035v1","updated":"2023-11-27T17:55:50Z","published":"2023-11-27T17:55:50Z","title":"RobustState: Boosting Fidelity of Quantum State Preparation via\n Noise-Aware Variational Training","summary":" Quantum state preparation, a crucial subroutine in quantum computing,\ninvolves generating a target quantum state from initialized qubits. Arbitrary\nstate preparation algorithms can be broadly categorized into arithmetic\ndecomposition (AD) and variational quantum state preparation (VQSP). AD employs\na predefined procedure to decompose the target state into a series of gates,\nwhereas VQSP iteratively tunes ansatz parameters to approximate target state.\nVQSP is particularly apt for Noisy-Intermediate Scale Quantum (NISQ) machines\ndue to its shorter circuits. However, achieving noise-robust parameter\noptimization still remains challenging.\n We present RobustState, a novel VQSP training methodology that combines high\nrobustness with high training efficiency. The core idea involves utilizing\nmeasurement outcomes from real machines to perform back-propagation through\nclassical simulators, thus incorporating real quantum noise into gradient\ncalculations. RobustState serves as a versatile, plug-and-play technique\napplicable for training parameters from scratch or fine-tuning existing\nparameters to enhance fidelity on target machines. 
It is adaptable to various\nansatzes at both gate and pulse levels and can even benefit other variational\nalgorithms, such as variational unitary synthesis.\n Comprehensive evaluation of RobustState on state preparation tasks for 4\ndistinct quantum algorithms using 10 real quantum machines demonstrates a\ncoherent error reduction of up to 7.1 $\\times$ and state fidelity improvement\nof up to 96\\% and 81\\% for 4-Q and 5-Q states, respectively. On average,\nRobustState improves fidelity by 50\\% and 72\\% for 4-Q and 5-Q states compared\nto baseline approaches.\n","authors":["Hanrui Wang","Yilian Liu","Pengyu Liu","Jiaqi Gu","Zirui Li","Zhiding Liang","Jinglei Cheng","Yongshan Ding","Xuehai Qian","Yiyu Shi","David Z. Pan","Frederic T. Chong","Song Han"],"pdf_url":"https://arxiv.org/pdf/2311.16035v1.pdf","comment":"Accepted to FASTML @ ICCAD 2023. 14 pages, 20 figures"},{"id":"http://arxiv.org/abs/2311.16030v1","updated":"2023-11-27T17:50:14Z","published":"2023-11-27T17:50:14Z","title":"Machine Learning-Enhanced Aircraft Landing Scheduling under\n Uncertainties","summary":" This paper addresses aircraft delays, emphasizing their impact on safety and\nfinancial losses. To mitigate these issues, an innovative machine learning\n(ML)-enhanced landing scheduling methodology is proposed, aiming to improve\nautomation and safety. Analyzing flight arrival delay scenarios reveals strong\nmultimodal distributions and clusters in arrival flight time durations. A\nmulti-stage conditional ML predictor enhances separation time prediction based\non flight events. ML predictions are then integrated as safety constraints in a\ntime-constrained traveling salesman problem formulation, solved using\nmixed-integer linear programming (MILP). Historical flight recordings and model\npredictions address uncertainties between successive flights, ensuring\nreliability. The proposed method is validated using real-world data from the\nAtlanta Air Route Traffic Control Center (ARTCC ZTL). Case studies demonstrate\nan average 17.2% reduction in total landing time compared to the\nFirst-Come-First-Served (FCFS) rule. Unlike FCFS, the proposed methodology\nconsiders uncertainties, instilling confidence in scheduling. The study\nconcludes with remarks and outlines future research directions.\n","authors":["Yutian Pang","Peng Zhao","Jueming Hu","Yongming Liu"],"pdf_url":"https://arxiv.org/pdf/2311.16030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16026v1","updated":"2023-11-27T17:40:02Z","published":"2023-11-27T17:40:02Z","title":"A Neural Framework for Generalized Causal Sensitivity Analysis","summary":" Unobserved confounding is common in many applications, making causal\ninference from observational data challenging. As a remedy, causal sensitivity\nanalysis is an important tool to draw causal conclusions under unobserved\nconfounding with mathematical guarantees. In this paper, we propose NeuralCSA,\na neural framework for generalized causal sensitivity analysis. Unlike previous\nwork, our framework is compatible with (i) a large class of sensitivity models,\nincluding the marginal sensitivity model, f-sensitivity models, and Rosenbaum's\nsensitivity model; (ii) different treatment types (i.e., binary and\ncontinuous); and (iii) different causal queries, including (conditional)\naverage treatment effects and simultaneous effects on multiple outcomes. 
The\ngenerality of \\frameworkname is achieved by learning a latent distribution\nshift that corresponds to a treatment intervention using two conditional\nnormalizing flows. We provide theoretical guarantees that NeuralCSA is able to\ninfer valid bounds on the causal query of interest and also demonstrate this\nempirically using both simulated and real-world data.\n","authors":["Dennis Frauen","Fergus Imrie","Alicia Curth","Valentyn Melnychuk","Stefan Feuerriegel","Mihaela van der Schaar"],"pdf_url":"https://arxiv.org/pdf/2311.16026v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.13258v2","updated":"2023-11-27T17:36:19Z","published":"2023-10-20T03:34:31Z","title":"ManiCast: Collaborative Manipulation with Cost-Aware Human Forecasting","summary":" Seamless human-robot manipulation in close proximity relies on accurate\nforecasts of human motion. While there has been significant progress in\nlearning forecast models at scale, when applied to manipulation tasks, these\nmodels accrue high errors at critical transition points leading to degradation\nin downstream planning performance. Our key insight is that instead of\npredicting the most likely human motion, it is sufficient to produce forecasts\nthat capture how future human motion would affect the cost of a robot's plan.\nWe present ManiCast, a novel framework that learns cost-aware human forecasts\nand feeds them to a model predictive control planner to execute collaborative\nmanipulation tasks. Our framework enables fluid, real-time interactions between\na human and a 7-DoF robot arm across a number of real-world tasks such as\nreactive stirring, object handovers, and collaborative table setting. We\nevaluate both the motion forecasts and the end-to-end forecaster-planner system\nagainst a range of learned and heuristic baselines while additionally\ncontributing new datasets. We release our code and datasets at\nhttps://portal-cornell.github.io/manicast/.\n","authors":["Kushal Kedia","Prithwish Dan","Atiksh Bhardwaj","Sanjiban Choudhury"],"pdf_url":"https://arxiv.org/pdf/2310.13258v2.pdf","comment":"CoRL 2023"},{"id":"http://arxiv.org/abs/2311.16021v1","updated":"2023-11-27T17:35:28Z","published":"2023-11-27T17:35:28Z","title":"Scheduling and Communication Schemes for Decentralized Federated\n Learning","summary":" Federated learning (FL) is a distributed machine learning paradigm in which a\nlarge number of clients coordinate with a central server to learn a model\nwithout sharing their own training data. One central server is not enough, due\nto problems of connectivity with clients. In this paper, a decentralized\nfederated learning (DFL) model with the stochastic gradient descent (SGD)\nalgorithm has been introduced, as a more scalable approach to improve the\nlearning performance in a network of agents with arbitrary topology. Three\nscheduling policies for DFL have been proposed for communications between the\nclients and the parallel servers, and the convergence, accuracy, and loss have\nbeen tested in a totally decentralized mplementation of SGD. The experimental\nresults show that the proposed scheduling polices have an impact both on the\nspeed of convergence and in the final global model.\n","authors":["Bahaa-Eldin Ali Abdelghany","Ana Fernández-Vilas","Manuel Fernández-Veiga","Nashwa El-Bendary","Ammar M. Hassan","Walid M. 
Abdelmoez"],"pdf_url":"https://arxiv.org/pdf/2311.16021v1.pdf","comment":"32nd International Conference on Computer Theory and Applications\n (ICCTA), Alexandria, Egypt, 2022"},{"id":"http://arxiv.org/abs/2203.09659v3","updated":"2023-11-27T17:23:10Z","published":"2022-03-17T23:52:08Z","title":"Low-degree learning and the metric entropy of polynomials","summary":" Let $\\mathscr{F}_{n,d}$ be the class of all functions $f:\\{-1,1\\}^n\\to[-1,1]$\non the $n$-dimensional discrete hypercube of degree at most $d$. In the first\npart of this paper, we prove that any (deterministic or randomized) algorithm\nwhich learns $\\mathscr{F}_{n,d}$ with $L_2$-accuracy $\\varepsilon$ requires at\nleast $\\Omega((1-\\sqrt{\\varepsilon})2^d\\log n)$ queries for large enough $n$,\nthus establishing the sharpness as $n\\to\\infty$ of a recent upper bound of\nEskenazis and Ivanisvili (2021). To do this, we show that the $L_2$-packing\nnumbers $\\mathsf{M}(\\mathscr{F}_{n,d},\\|\\cdot\\|_{L_2},\\varepsilon)$ of the\nconcept class $\\mathscr{F}_{n,d}$ satisfy the two-sided estimate\n$$c(1-\\varepsilon)2^d\\log n \\leq \\log\n\\mathsf{M}(\\mathscr{F}_{n,d},\\|\\cdot\\|_{L_2},\\varepsilon) \\leq \\frac{2^{Cd}\\log\nn}{\\varepsilon^4}$$ for large enough $n$, where $c, C>0$ are universal\nconstants. In the second part of the paper, we present a logarithmic upper\nbound for the randomized query complexity of classes of bounded approximate\npolynomials whose Fourier spectra are concentrated on few subsets. As an\napplication, we prove new estimates for the number of random queries required\nto learn approximate juntas of a given degree, functions with rapidly decaying\nFourier tails and constant depth circuits of given size. Finally, we obtain\nbounds for the number of queries required to learn the polynomial class\n$\\mathscr{F}_{n,d}$ without error in the query and random example models.\n","authors":["Alexandros Eskenazis","Paata Ivanisvili","Lauritz Streck"],"pdf_url":"https://arxiv.org/pdf/2203.09659v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11913v2","updated":"2023-11-27T17:17:39Z","published":"2023-11-20T16:44:18Z","title":"Deep Calibration of Market Simulations using Neural Density Estimators\n and Embedding Networks","summary":" The ability to construct a realistic simulator of financial exchanges,\nincluding reproducing the dynamics of the limit order book, can give insight\ninto many counterfactual scenarios, such as a flash crash, a margin call, or\nchanges in macroeconomic outlook. In recent years, agent-based models have been\ndeveloped that reproduce many features of an exchange, as summarised by a set\nof stylised facts and statistics. However, the ability to calibrate simulators\nto a specific period of trading remains an open challenge. In this work, we\ndevelop a novel approach to the calibration of market simulators by leveraging\nrecent advances in deep learning, specifically using neural density estimators\nand embedding networks. We demonstrate that our approach is able to correctly\nidentify high probability parameter sets, both when applied to synthetic and\nhistorical data, and without reliance on manually selected or weighted\nensembles of stylised facts.\n","authors":["Namid R. 
Stillman","Rory Baggott","Justin Lyon","Jianfei Zhang","Dingqiu Zhu","Tao Chen","Perukrishnen Vytelingum"],"pdf_url":"https://arxiv.org/pdf/2311.11913v2.pdf","comment":"4th ACM International Conference on AI in Finance (ICAIF 2023)"},{"id":"http://arxiv.org/abs/2211.14400v5","updated":"2023-11-27T17:13:24Z","published":"2022-11-25T23:32:26Z","title":"Optimal Approximation Rates for Deep ReLU Neural Networks on Sobolev and\n Besov Spaces","summary":" Let $\\Omega = [0,1]^d$ be the unit cube in $\\mathbb{R}^d$. We study the\nproblem of how efficiently, in terms of the number of parameters, deep neural\nnetworks with the ReLU activation function can approximate functions in the\nSobolev spaces $W^s(L_q(\\Omega))$ and Besov spaces $B^s_r(L_q(\\Omega))$, with\nerror measured in the $L_p(\\Omega)$ norm. This problem is important when\nstudying the application of neural networks in a variety of fields, including\nscientific computing and signal processing, and has previously been solved only\nwhen $p=q=\\infty$. Our contribution is to provide a complete solution for all\n$1\\leq p,q\\leq \\infty$ and $s > 0$ for which the corresponding Sobolev or Besov\nspace compactly embeds into $L_p$. The key technical tool is a novel\nbit-extraction technique which gives an optimal encoding of sparse vectors.\nThis enables us to obtain sharp upper bounds in the non-linear regime where $p\n> q$. We also provide a novel method for deriving $L_p$-approximation lower\nbounds based upon VC-dimension when $p < \\infty$. Our results show that very\ndeep ReLU networks significantly outperform classical methods of approximation\nin terms of the number of parameters, but that this comes at the cost of\nparameters which are not encodable.\n","authors":["Jonathan W. Siegel"],"pdf_url":"https://arxiv.org/pdf/2211.14400v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16008v1","updated":"2023-11-27T17:02:56Z","published":"2023-11-27T17:02:56Z","title":"Using Decentralized Aggregation for Federated Learning with Differential\n Privacy","summary":" Nowadays, the ubiquitous usage of mobile devices and networks have raised\nconcerns about the loss of control over personal data and research advance\ntowards the trade-off between privacy and utility in scenarios that combine\nexchange communications, big databases and distributed and collaborative (P2P)\nMachine Learning techniques. On the other hand, although Federated Learning\n(FL) provides some level of privacy by retaining the data at the local node,\nwhich executes a local training to enrich a global model, this scenario is\nstill susceptible to privacy breaches as membership inference attacks. To\nprovide a stronger level of privacy, this research deploys an experimental\nenvironment for FL with Differential Privacy (DP) using benchmark datasets. The\nobtained results show that the election of parameters and techniques of DP is\ncentral in the aforementioned trade-off between privacy and utility by means of\na classification example.\n","authors":["Hadeel Abd El-Kareem","Abd El-Moaty Saleh","Ana Fernández-Vilas","Manuel Fernández-Veiga","asser El-Sonbaty"],"pdf_url":"https://arxiv.org/pdf/2311.16008v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06627v2","updated":"2023-11-27T16:59:39Z","published":"2023-10-10T13:45:59Z","title":"What If the TV Was Off? 
Examining Counterfactual Reasoning Abilities of\n Multi-modal Language Models","summary":" Counterfactual reasoning, a fundamental aspect of human cognition, involves\ncontemplating alternatives to established facts or past events, significantly\nenhancing our abilities in planning and decision-making. In light of the\nadvancements in current multi-modal large language models, we explore their\neffectiveness in counterfactual reasoning. To facilitate this investigation, we\nintroduce a novel dataset, C-VQA, specifically designed to test the\ncounterfactual reasoning capabilities of modern multi-modal large language\nmodels. This dataset is constructed by infusing original questions with\ncounterfactual presuppositions, spanning various types such as numerical and\nboolean queries. It encompasses a mix of real and synthetic data, representing\na wide range of difficulty levels. Our thorough evaluations of contemporary\nvision-language models using this dataset have revealed substantial performance\ndrops, with some models showing up to a 40\\% decrease, highlighting a\nsignificant gap between current models and human-like vision reasoning\ncapabilities. We hope our dataset will serve as a vital benchmark for\nevaluating the counterfactual reasoning capabilities of models. Code and\ndataset are publicly available at https://bzhao.me/C-VQA/.\n","authors":["Letian Zhang","Xiaotong Zhai","Zhongkai Zhao","Yongshuo Zong","Xin Wen","Bingchen Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.06627v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16004v1","updated":"2023-11-27T16:55:04Z","published":"2023-11-27T16:55:04Z","title":"Improved Data Generation for Enhanced Asset Allocation: A Synthetic\n Dataset Approach for the Fixed Income Universe","summary":" We present a novel process for generating synthetic datasets tailored to\nassess asset allocation methods and construct portfolios within the fixed\nincome universe. Our approach begins by enhancing the CorrGAN model to generate\nsynthetic correlation matrices. Subsequently, we propose an Encoder-Decoder\nmodel that samples additional data conditioned on a given correlation matrix.\nThe resulting synthetic dataset facilitates in-depth analyses of asset\nallocation methods across diverse asset universes. Additionally, we provide a\ncase study that exemplifies the use of the synthetic dataset to improve\nportfolios constructed within a simulation-based asset allocation process.\n","authors":["Szymon Kubiak","Tillman Weyde","Oleksandr Galkin","Dan Philps","Ram Gopal"],"pdf_url":"https://arxiv.org/pdf/2311.16004v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16003v1","updated":"2023-11-27T16:52:25Z","published":"2023-11-27T16:52:25Z","title":"Forecasting Auxiliary Energy Consumption for Electric Heavy-Duty\n Vehicles","summary":" Accurate energy consumption prediction is crucial for optimizing the\noperation of electric commercial heavy-duty vehicles, e.g., route planning for\ncharging. Moreover, understanding why certain predictions are cast is paramount\nfor such a predictive model to gain user trust and be deployed in practice.\nSince commercial vehicles operate differently as transportation tasks, ambient,\nand drivers vary, a heterogeneous population is expected when building an AI\nsystem for forecasting energy consumption. The dependencies between the input\nfeatures and the target values are expected to also differ across\nsub-populations. One well-known example of such a statistical phenomenon is the\nSimpson paradox. 
In this paper, we illustrate that such a setting poses a\nchallenge for existing XAI methods that produce global feature statistics, e.g.\nLIME or SHAP, causing them to yield misleading results. We demonstrate a\npotential solution by training multiple regression models on subsets of data.\nIt not only leads to superior regression performance but also more relevant and\nconsistent LIME explanations. Given that the employed groupings correspond to\nrelevant sub-populations, the associations between the input features and the\ntarget values are consistent within each cluster but different across clusters.\nExperiments on both synthetic and real-world datasets show that such splitting\nof a complex problem into simpler ones yields better regression performance and\ninterpretability.\n","authors":["Yuantao Fan","Zhenkan Wang","Sepideh Pashami","Slawomir Nowaczyk","Henrik Ydreskog"],"pdf_url":"https://arxiv.org/pdf/2311.16003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08736v2","updated":"2023-11-27T16:49:49Z","published":"2023-08-17T02:18:59Z","title":"On the Effectiveness of Log Representation for Log-based Anomaly\n Detection","summary":" Logs are an essential source of information for people to understand the\nrunning status of a software system. Due to the evolving modern software\narchitecture and maintenance methods, more research efforts have been devoted\nto automated log analysis. In particular, machine learning (ML) has been widely\nused in log analysis tasks. In ML-based log analysis tasks, converting textual\nlog data into numerical feature vectors is a critical and indispensable step.\nHowever, the impact of using different log representation techniques on the\nperformance of the downstream models is not clear, which limits researchers and\npractitioners' opportunities of choosing the optimal log representation\ntechniques in their automated log analysis workflows. Therefore, this work\ninvestigates and compares the commonly adopted log representation techniques\nfrom previous log analysis research. Particularly, we select six log\nrepresentation techniques and evaluate them with seven ML models and four\npublic log datasets (i.e., HDFS, BGL, Spirit and Thunderbird) in the context of\nlog-based anomaly detection. We also examine the impacts of the log parsing\nprocess and the different feature aggregation approaches when they are employed\nwith log representation techniques. From the experiments, we provide some\nheuristic guidelines for future researchers and developers to follow when\ndesigning an automated log analysis workflow. We believe our comprehensive\ncomparison of log representation techniques can help researchers and\npractitioners better understand the characteristics of different log\nrepresentation techniques and provide them with guidance for selecting the most\nsuitable ones for their ML-based log analysis workflow.\n","authors":["Xingfang Wu","Heng Li","Foutse Khomh"],"pdf_url":"https://arxiv.org/pdf/2308.08736v2.pdf","comment":"Accepted by Journal of Empirical Software Engineering (EMSE)"},{"id":"http://arxiv.org/abs/2307.06255v2","updated":"2023-11-27T16:48:14Z","published":"2023-07-12T15:50:38Z","title":"Machine learning and Topological data analysis identify unique features\n of human papillae in 3D scans","summary":" The tongue surface houses a range of papillae that are integral to the\nmechanics and chemistry of taste and textural sensation. 
Although gustatory\nfunction of papillae is well investigated, the uniqueness of papillae within\nand across individuals remains elusive. Here, we present the first machine\nlearning framework on 3D microscopic scans of human papillae (n = 2092),\nuncovering the uniqueness of geometric and topological features of papillae.\nThe finer differences in shapes of papillae are investigated computationally\nbased on a number of features derived from discrete differential geometry and\ncomputational topology. Interpretable machine learning techniques show that\npersistent homology features of the papillae shape are the most effective in\npredicting the biological variables. Models trained on these features with\nsmall volumes of data samples predict the type of papillae with an accuracy of\n85%. The papillae type classification models can map the spatial arrangement of\nfiliform and fungiform papillae on a surface. Remarkably, the papillae are\nfound to be distinctive across individuals and an individual can be identified\nwith an accuracy of 48% among the 15 participants from a single papillae.\nCollectively, this is the first unprecedented evidence demonstrating that\ntongue papillae can serve as a unique identifier inspiring new research\ndirection for food preferences and oral diagnostics.\n","authors":["Rayna Andreeva","Anwesha Sarkar","Rik Sarkar"],"pdf_url":"https://arxiv.org/pdf/2307.06255v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16001v1","updated":"2023-11-27T16:47:09Z","published":"2023-11-27T16:47:09Z","title":"Automated Measurement of Vascular Calcification in Femoral\n Endarterectomy Patients Using Deep Learning","summary":" Atherosclerosis, a chronic inflammatory disease affecting the large arteries,\npresents a global health risk. Accurate analysis of diagnostic images, like\ncomputed tomographic angiograms (CTAs), is essential for staging and monitoring\nthe progression of atherosclerosis-related conditions, including peripheral\narterial disease (PAD). However, manual analysis of CTA images is\ntime-consuming and tedious. To address this limitation, we employed a deep\nlearning model to segment the vascular system in CTA images of PAD patients\nundergoing femoral endarterectomy surgery and to measure vascular calcification\nfrom the left renal artery to the patella. Utilizing proprietary CTA images of\n27 patients undergoing femoral endarterectomy surgery provided by Prisma Health\nMidlands, we developed a Deep Neural Network (DNN) model to first segment the\narterial system, starting from the descending aorta to the patella, and second,\nto provide a metric of arterial calcification. Our designed DNN achieved 83.4%\naverage Dice accuracy in segmenting arteries from aorta to patella, advancing\nthe state-of-the-art by 0.8%. Furthermore, our work is the first to present a\nrobust statistical analysis of automated calcification measurement in the lower\nextremities using deep learning, attaining a Mean Absolute Percentage Error\n(MAPE) of 9.5% and a correlation coefficient of 0.978 between automated and\nmanual calcification scores. These findings underscore the potential of deep\nlearning techniques as a rapid and accurate tool for medical professionals to\nassess calcification in the abdominal aorta and its branches above the patella.\nThe developed DNN model and related documentation in this project are available\nat GitHub page at https://github.com/pip-alireza/DeepCalcScoring.\n","authors":["Alireza Bagheri Rajeoni","Breanna Pederson","Daniel G. Clair","Susan M. 
Lessner","Homayoun Valafar"],"pdf_url":"https://arxiv.org/pdf/2311.16001v1.pdf","comment":"Published in MDPI Diagnostic journal, the code can be accessed via\n the GitHub link in the paper"},{"id":"http://arxiv.org/abs/2310.10541v2","updated":"2023-11-27T16:45:18Z","published":"2023-10-16T16:13:53Z","title":"AST: Effective Dataset Distillation through Alignment with Smooth and\n High-Quality Expert Trajectories","summary":" Training large AI models typically requires large-scale datasets in the\nmachine learning process, making training and parameter-tuning process both\ntime-consuming and costly. Some researchers address this problem by carefully\nsynthesizing a very small number of highly representative and informative\nsamples from real-world datasets. This approach, known as Dataset Distillation\n(DD), proposes a perspective for data-efficient learning. Despite recent\nprogress in this field, the performance of existing methods still cannot meet\nexpectations, and distilled datasets cannot effectively replace original\ndatasets. In this paper, unlike previous methods that focus solely on improving\nthe effectiveness of student distillation, we recognize and leverage the\nimportant mutual influence between expert and student models. We observed that\nthe smoothness of expert trajectories has a significant impact on subsequent\nstudent parameter alignment. Based on this, we propose an effective DD\nframework named AST, standing for Alignment with Smooth and high-quality expert\nTrajectories. We devise the integration of clipping loss and gradient penalty\nto regulate the rate of parameter changes in expert trajectory generation. To\nfurther refine the student parameter alignment with expert trajectory, we put\nforward representative initialization for the synthetic dataset and balanced\ninner-loop loss in response to the sensitivity exhibited towards randomly\ninitialized variables during distillation. We also propose two enhancement\nstrategies, namely intermediate matching loss and weight perturbation, to\nmitigate the potential occurrence of cumulative errors. We conduct extensive\nexperiments on datasets of different scales, sizes, and resolutions. The\nresults demonstrate that the proposed method significantly outperforms prior\nmethods.\n","authors":["Jiyuan Shen","Wenzhuo Yang","Kwok-Yan Lam"],"pdf_url":"https://arxiv.org/pdf/2310.10541v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15996v1","updated":"2023-11-27T16:44:50Z","published":"2023-11-27T16:44:50Z","title":"Closing the ODE-SDE gap in score-based diffusion models through the\n Fokker-Planck equation","summary":" Score-based diffusion models have emerged as one of the most promising\nframeworks for deep generative modelling, due to their state-of-the art\nperformance in many generation tasks while relying on mathematical foundations\nsuch as stochastic differential equations (SDEs) and ordinary differential\nequations (ODEs). Empirically, it has been reported that ODE based samples are\ninferior to SDE based samples. In this paper we rigorously describe the range\nof dynamics and approximations that arise when training score-based diffusion\nmodels, including the true SDE dynamics, the neural approximations, the various\napproximate particle dynamics that result, as well as their associated\nFokker--Planck equations and the neural network approximations of these\nFokker--Planck equations. 
We systematically analyse the difference between the\nODE and SDE dynamics of score-based diffusion models, and link it to an\nassociated Fokker--Planck equation. We derive a theoretical upper bound on the\nWasserstein 2-distance between the ODE- and SDE-induced distributions in terms\nof a Fokker--Planck residual. We also show numerically that conventional\nscore-based diffusion models can exhibit significant differences between ODE-\nand SDE-induced distributions which we demonstrate using explicit comparisons.\nMoreover, we show numerically that reducing the Fokker--Planck residual by\nadding it as an additional regularisation term leads to closing the gap between\nODE- and SDE-induced distributions. Our experiments suggest that this\nregularisation can improve the distribution generated by the ODE, however that\nthis can come at the cost of degraded SDE sample quality.\n","authors":["Teo Deveney","Jan Stanczuk","Lisa Maria Kreusser","Chris Budd","Carola-Bibiane Schönlieb"],"pdf_url":"https://arxiv.org/pdf/2311.15996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15995v1","updated":"2023-11-27T16:44:13Z","published":"2023-11-27T16:44:13Z","title":"Sensitivity-Based Layer Insertion for Residual and Feedforward Neural\n Networks","summary":" The training of neural networks requires tedious and often manual tuning of\nthe network architecture. We propose a systematic method to insert new layers\nduring the training process, which eliminates the need to choose a fixed\nnetwork size before training. Our technique borrows techniques from constrained\noptimization and is based on first-order sensitivity information of the\nobjective with respect to the virtual parameters that additional layers, if\ninserted, would offer. We consider fully connected feedforward networks with\nselected activation functions as well as residual neural networks. In numerical\nexperiments, the proposed sensitivity-based layer insertion technique exhibits\nimproved training decay, compared to not inserting the layer. Furthermore, the\ncomputational effort is reduced in comparison to inserting the layer from the\nbeginning. The code is available at\n\\url{https://github.com/LeonieKreis/layer_insertion_sensitivity_based}.\n","authors":["Evelyn Herberg","Roland Herzog","Frederik Köhne","Leonie Kreis","Anton Schiela"],"pdf_url":"https://arxiv.org/pdf/2311.15995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15990v1","updated":"2023-11-27T16:39:55Z","published":"2023-11-27T16:39:55Z","title":"Should We Learn Most Likely Functions or Parameters?","summary":" Standard regularized training procedures correspond to maximizing a posterior\ndistribution over parameters, known as maximum a posteriori (MAP) estimation.\nHowever, model parameters are of interest only insomuch as they combine with\nthe functional form of a model to provide a function that can make good\npredictions. Moreover, the most likely parameters under the parameter posterior\ndo not generally correspond to the most likely function induced by the\nparameter posterior. In fact, we can re-parametrize a model such that any\nsetting of parameters can maximize the parameter posterior. As an alternative,\nwe investigate the benefits and drawbacks of directly estimating the most\nlikely function implied by the model and the data. 
We show that this procedure\nleads to pathological solutions when using neural networks and prove conditions\nunder which the procedure is well-behaved, as well as a scalable approximation.\nUnder these conditions, we find that function-space MAP estimation can lead to\nflatter minima, better generalization, and improved robustness to overfitting.\n","authors":["Shikai Qiu","Tim G. J. Rudner","Sanyam Kapoor","Andrew Gordon Wilson"],"pdf_url":"https://arxiv.org/pdf/2311.15990v1.pdf","comment":"NeurIPS 2023. Code available at\n https://github.com/activatedgeek/function-space-map"},{"id":"http://arxiv.org/abs/2303.01486v4","updated":"2023-11-27T16:36:53Z","published":"2023-03-02T18:47:51Z","title":"Understanding plasticity in neural networks","summary":" Plasticity, the ability of a neural network to quickly change its predictions\nin response to new information, is essential for the adaptability and\nrobustness of deep reinforcement learning systems. Deep neural networks are\nknown to lose plasticity over the course of training even in relatively simple\nlearning problems, but the mechanisms driving this phenomenon are still poorly\nunderstood. This paper conducts a systematic empirical analysis into plasticity\nloss, with the goal of understanding the phenomenon mechanistically in order to\nguide the future development of targeted solutions. We find that loss of\nplasticity is deeply connected to changes in the curvature of the loss\nlandscape, but that it often occurs in the absence of saturated units. Based on\nthis insight, we identify a number of parameterization and optimization design\nchoices which enable networks to better preserve plasticity over the course of\ntraining. We validate the utility of these findings on larger-scale RL\nbenchmarks in the Arcade Learning Environment.\n","authors":["Clare Lyle","Zeyu Zheng","Evgenii Nikishin","Bernardo Avila Pires","Razvan Pascanu","Will Dabney"],"pdf_url":"https://arxiv.org/pdf/2303.01486v4.pdf","comment":"Accepted to ICML 2023 (oral presentation)"},{"id":"http://arxiv.org/abs/2311.15983v1","updated":"2023-11-27T16:28:20Z","published":"2023-11-27T16:28:20Z","title":"Sparsify-then-Classify: From Internal Neurons of Large Language Models\n To Efficient Text Classifiers","summary":" Among the many tasks that Large Language Models (LLMs) have revolutionized is\ntext classification. However, existing approaches for applying pretrained LLMs\nto text classification predominantly rely on using single token outputs from\nonly the last layer of hidden states. As a result, they suffer from limitations\nin efficiency, task-specificity, and interpretability. In our work, we\ncontribute an approach that uses all internal representations by employing\nmultiple pooling strategies on all activation and hidden states. Our novel\nlightweight strategy, Sparsify-then-Classify (STC) first sparsifies\ntask-specific features layer-by-layer, then aggregates across layers for text\nclassification. STC can be applied as a seamless plug-and-play module on top of\nexisting LLMs. 
Our experiments on a comprehensive set of models and datasets\ndemonstrate that STC not only consistently improves the classification\nperformance of pretrained and fine-tuned models, but is also more efficient for\nboth training and inference, and is more intrinsically interpretable.\n","authors":["Yilun Liu","Difan Jiao","Ashton Anderson"],"pdf_url":"https://arxiv.org/pdf/2311.15983v1.pdf","comment":"23 pages, 5 figures, 8 tables Code available at\n https://github.com/difanj0713/Sparsify-then-Classify"},{"id":"http://arxiv.org/abs/2311.15979v1","updated":"2023-11-27T16:25:12Z","published":"2023-11-27T16:25:12Z","title":"Soil Organic Carbon Estimation from Climate-related Features with Graph\n Neural Network","summary":" Soil organic carbon (SOC) plays a pivotal role in the global carbon cycle,\nimpacting climate dynamics and necessitating accurate estimation for\nsustainable land and agricultural management. While traditional methods of SOC\nestimation face resolution and accuracy challenges, recent technological\nsolutions harness remote sensing, machine learning, and high-resolution\nsatellite mapping. Graph Neural Networks (GNNs), especially when integrated\nwith positional encoders, can capture complex relationships between soil and\nclimate. Using the LUCAS database, this study compared four GNN operators in\nthe positional encoder framework. Results revealed that the PESAGE and\nPETransformer models outperformed others in SOC estimation, indicating their\npotential in capturing the complex relationship between SOC and climate\nfeatures. Our findings confirm the feasibility of applications of GNN\narchitectures in SOC prediction, establishing a framework for future\nexplorations of this topic with more advanced GNN models.\n","authors":["Weiying Zhao","Natalia Efremova"],"pdf_url":"https://arxiv.org/pdf/2311.15979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.00553v3","updated":"2023-11-27T16:24:59Z","published":"2023-04-02T15:04:43Z","title":"From Isolated Islands to Pangea: Unifying Semantic Space for Human\n Action Understanding","summary":" As a vital step toward the intelligent agent, Action understanding matters\nfor intelligent agents and has attracted long-term attention. It can be formed\nas the mapping from the action physical space to the semantic space. Typically,\nresearchers built action datasets according to idiosyncratic choices to define\nclasses and push the envelope of benchmarks respectively. Thus, datasets are\nincompatible with each other like \"Isolated Islands\" due to semantic gaps and\nvarious class granularities, e.g., do housework in dataset A and wash plate in\ndataset B. We argue that a more principled semantic space is an urgent need to\nconcentrate the community efforts and enable us to use all datasets together to\npursue generalizable action learning. To this end, we design a structured\naction semantic space in view of verb taxonomy hierarchy and covering massive\nactions. By aligning the classes of previous datasets to our semantic space, we\ngather (image/video/skeleton/MoCap) datasets into a unified database in a\nunified label system, i.e., bridging ``isolated islands'' into a \"Pangea\".\nAccordingly, we propose a novel model mapping from the physical space to\nsemantic space to fully use Pangea. In extensive experiments, our new system\nshows significant superiority, especially in transfer learning. 
Code and data\nwill be made publicly available.\n","authors":["Yong-Lu Li","Xiaoqian Wu","Xinpeng Liu","Zehao Wang","Yiming Dou","Yikun Ji","Junyi Zhang","Yixing Li","Jingru Tan","Xudong Lu","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2304.00553v3.pdf","comment":"Project Webpage: https://mvig-rhos.com/pangea"},{"id":"http://arxiv.org/abs/2308.07037v4","updated":"2023-11-27T16:15:44Z","published":"2023-08-14T09:56:35Z","title":"Bayesian Flow Networks","summary":" This paper introduces Bayesian Flow Networks (BFNs), a new class of\ngenerative model in which the parameters of a set of independent distributions\nare modified with Bayesian inference in the light of noisy data samples, then\npassed as input to a neural network that outputs a second, interdependent\ndistribution. Starting from a simple prior and iteratively updating the two\ndistributions yields a generative procedure similar to the reverse process of\ndiffusion models; however it is conceptually simpler in that no forward process\nis required. Discrete and continuous-time loss functions are derived for\ncontinuous, discretised and discrete data, along with sample generation\nprocedures. Notably, the network inputs for discrete data lie on the\nprobability simplex, and are therefore natively differentiable, paving the way\nfor gradient-based sample guidance and few-step generation in discrete domains\nsuch as language modelling. The loss function directly optimises data\ncompression and places no restrictions on the network architecture. In our\nexperiments BFNs achieve competitive log-likelihoods for image modelling on\ndynamically binarized MNIST and CIFAR-10, and outperform all known discrete\ndiffusion models on the text8 character-level language modelling task.\n","authors":["Alex Graves","Rupesh Kumar Srivastava","Timothy Atkinson","Faustino Gomez"],"pdf_url":"https://arxiv.org/pdf/2308.07037v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.05400v5","updated":"2023-11-27T16:08:49Z","published":"2022-03-10T14:45:57Z","title":"Asymptotic Bounds for Smoothness Parameter Estimates in Gaussian Process\n Interpolation","summary":" It is common to model a deterministic response function, such as the output\nof a computer experiment, as a Gaussian process with a Mat\\'ern covariance\nkernel. The smoothness parameter of a Mat\\'ern kernel determines many important\nproperties of the model in the large data limit, including the rate of\nconvergence of the conditional mean to the response function. We prove that the\nmaximum likelihood estimate of the smoothness parameter cannot asymptotically\nundersmooth the truth when the data are obtained on a fixed bounded subset of\n$\\mathbb{R}^d$. That is, if the data-generating response function has Sobolev\nsmoothness $\\nu_0 > d/2$, then the smoothness parameter estimate cannot be\nasymptotically less than $\\nu_0$. The lower bound is sharp. Additionally, we\nshow that maximum likelihood estimation recovers the true smoothness for a\nclass of compactly supported self-similar functions. For cross-validation we\nprove an asymptotic lower bound $\\nu_0 - d/2$, which however is unlikely to be\nsharp. 
The results are based on approximation theory in Sobolev spaces and some\ngeneral theorems that restrict the set of values that the parameter estimators\ncan take.\n","authors":["Toni Karvonen"],"pdf_url":"https://arxiv.org/pdf/2203.05400v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15966v1","updated":"2023-11-27T16:07:49Z","published":"2023-11-27T16:07:49Z","title":"Towards Transfer Learning for Large-Scale Image Classification Using\n Annealing-based Quantum Boltzmann Machines","summary":" Quantum Transfer Learning (QTL) recently gained popularity as a hybrid\nquantum-classical approach for image classification tasks by efficiently\ncombining the feature extraction capabilities of large Convolutional Neural\nNetworks with the potential benefits of Quantum Machine Learning (QML).\nExisting approaches, however, only utilize gate-based Variational Quantum\nCircuits for the quantum part of these procedures. In this work we present an\napproach to employ Quantum Annealing (QA) in QTL-based image classification.\nSpecifically, we propose using annealing-based Quantum Boltzmann Machines as\npart of a hybrid quantum-classical pipeline to learn the classification of\nreal-world, large-scale data such as medical images through supervised\ntraining. We demonstrate our approach by applying it to the three-class\nCOVID-CT-MD dataset, a collection of lung Computed Tomography (CT) scan slices.\nUsing Simulated Annealing as a stand-in for actual QA, we compare our method to\nclassical transfer learning, using a neural network of the same order of\nmagnitude, to display its improved classification performance. We find that our\napproach consistently outperforms its classical baseline in terms of test\naccuracy and AUC-ROC-Score and needs less training epochs to do this.\n","authors":["Daniëlle Schuman","Leo Sünkel","Philipp Altmann","Jonas Stein","Christoph Roch","Thomas Gabor","Claudia Linnhoff-Popien"],"pdf_url":"https://arxiv.org/pdf/2311.15966v1.pdf","comment":"7 pages, 3 figures (5 if counting subfigures), 1 table. To be\n published in the proceedings of the 2023 IEEE International Conference on\n Quantum Computing and Engineering (QCE)"},{"id":"http://arxiv.org/abs/2311.15964v1","updated":"2023-11-27T16:07:37Z","published":"2023-11-27T16:07:37Z","title":"Efficient Pre-training for Localized Instruction Generation of Videos","summary":" Procedural videos show step-by-step demonstrations of tasks like recipe\npreparation. Understanding such videos is challenging, involving the precise\nlocalization of steps and the generation of textual instructions. Manually\nannotating steps and writing instructions is costly, which limits the size of\ncurrent datasets and hinders effective learning. Leveraging large but noisy\nvideo-transcript datasets for pre-training can boost performance, but demands\nsignificant computational resources. Furthermore, transcripts contain\nirrelevant content and exhibit style variation compared to instructions written\nby human annotators. To mitigate both issues, we propose a technique,\nSieve-&-Swap, to automatically curate a smaller dataset: (i) Sieve filters\nirrelevant transcripts and (ii) Swap enhances the quality of the text\ninstruction by automatically replacing the transcripts with human-written\ninstructions from a text-only recipe dataset. The curated dataset, three orders\nof magnitude smaller than current web-scale datasets, enables efficient\ntraining of large-scale models with competitive performance. 
We complement our\nSieve-\\&-Swap approach with a Procedure Transformer (ProcX) for end-to-end step\nlocalization and instruction generation for procedural videos. When this model\nis pre-trained on our curated dataset, it achieves state-of-the-art performance\nin zero-shot and finetuning settings on YouCook2 and Tasty, while using a\nfraction of the computational resources.\n","authors":["Anil Batra","Davide Moltisanti","Laura Sevilla-Lara","Marcus Rohrbach","Frank Keller"],"pdf_url":"https://arxiv.org/pdf/2311.15964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15961v1","updated":"2023-11-27T16:06:48Z","published":"2023-11-27T16:06:48Z","title":"Maximum Likelihood Estimation is All You Need for Well-Specified\n Covariate Shift","summary":" A key challenge of modern machine learning systems is to achieve\nOut-of-Distribution (OOD) generalization -- generalizing to target data whose\ndistribution differs from that of source data. Despite its significant\nimportance, the fundamental question of ``what are the most effective\nalgorithms for OOD generalization'' remains open even under the standard\nsetting of covariate shift. This paper addresses this fundamental question by\nproving that, surprisingly, classical Maximum Likelihood Estimation (MLE)\npurely using source data (without any modification) achieves the minimax\noptimality for covariate shift under the well-specified setting. That is, no\nalgorithm performs better than MLE in this setting (up to a constant factor),\njustifying MLE is all you need. Our result holds for a very rich class of\nparametric models, and does not require any boundedness condition on the\ndensity ratio. We illustrate the wide applicability of our framework by\ninstantiating it to three concrete examples -- linear regression, logistic\nregression, and phase retrieval. This paper further complement the study by\nproving that, under the misspecified setting, MLE is no longer the optimal\nchoice, whereas Maximum Weighted Likelihood Estimator (MWLE) emerges as minimax\noptimal in certain scenarios.\n","authors":["Jiawei Ge","Shange Tang","Jianqing Fan","Cong Ma","Chi Jin"],"pdf_url":"https://arxiv.org/pdf/2311.15961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15960v1","updated":"2023-11-27T16:06:39Z","published":"2023-11-27T16:06:39Z","title":"Addressing Long-Horizon Tasks by Integrating Program Synthesis and State\n Machines","summary":" Deep reinforcement learning excels in various domains but lacks\ngeneralizability and interoperability. Programmatic RL methods (Trivedi et al.,\n2021; Liu et al., 2023) reformulate solving RL tasks as synthesizing\ninterpretable programs that can be executed in the environments. Despite\nencouraging results, these methods are limited to short-horizon tasks. On the\nother hand, representing RL policies using state machines (Inala et al., 2020)\ncan inductively generalize to long-horizon tasks; however, it struggles to\nscale up to acquire diverse and complex behaviors. This work proposes Program\nMachine Policies (POMPs), which bridge the advantages of programmatic RL and\nstate machine policies, allowing for the representation of complex behaviors\nand the address of long-term tasks. Specifically, we introduce a method that\ncan retrieve a set of effective, diverse, compatible programs. Then, we use\nthese programs as modes of a state machine and learn a transition function to\ntransition among mode programs, allowing for capturing long-horizon repetitive\nbehaviors. 
Our proposed framework outperforms programmatic RL and deep RL\nbaselines on various tasks and demonstrates the ability to generalize to even\nlonger horizons without any fine-tuning inductively. Ablation studies justify\nthe effectiveness of our proposed search algorithm for retrieving a set of\nprograms as modes.\n","authors":["Yu-An Lin","Chen-Tao Lee","Guan-Ting Liu","Pu-Jen Cheng","Shao-Hua Sun"],"pdf_url":"https://arxiv.org/pdf/2311.15960v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.09347v3","updated":"2023-11-27T15:59:55Z","published":"2022-03-17T14:26:28Z","title":"Dimensionality Reduction and Wasserstein Stability for Kernel Regression","summary":" In a high-dimensional regression framework, we study consequences of the\nnaive two-step procedure where first the dimension of the input variables is\nreduced and second, the reduced input variables are used to predict the output\nvariable with kernel regression. In order to analyze the resulting regression\nerrors, a novel stability result for kernel regression with respect to the\nWasserstein distance is derived. This allows us to bound errors that occur when\nperturbed input data is used to fit the regression function. We apply the\ngeneral stability result to principal component analysis (PCA). Exploiting\nknown estimates from the literature on both principal component analysis and\nkernel regression, we deduce convergence rates for the two-step procedure. The\nlatter turns out to be particularly useful in a semi-supervised setting.\n","authors":["Stephan Eckstein","Armin Iske","Mathias Trabs"],"pdf_url":"https://arxiv.org/pdf/2203.09347v3.pdf","comment":"Forthcoming in JMLR"},{"id":"http://arxiv.org/abs/2311.10093v2","updated":"2023-11-27T15:58:30Z","published":"2023-11-16T18:59:51Z","title":"The Chosen One: Consistent Characters in Text-to-Image Diffusion Models","summary":" Recent advances in text-to-image generation models have unlocked vast\npotential for visual creativity. However, these models struggle with generation\nof consistent characters, a crucial aspect for numerous real-world applications\nsuch as story visualization, game development asset design, advertising, and\nmore. Current methods typically rely on multiple pre-existing images of the\ntarget character or involve labor-intensive manual processes. In this work, we\npropose a fully automated solution for consistent character generation, with\nthe sole input being a text prompt. We introduce an iterative procedure that,\nat each stage, identifies a coherent set of images sharing a similar identity\nand extracts a more consistent identity from this set. Our quantitative\nanalysis demonstrates that our method strikes a better balance between prompt\nalignment and identity consistency compared to the baseline methods, and these\nfindings are reinforced by a user study. To conclude, we showcase several\npractical applications of our approach. 
Project page is available at\nhttps://omriavrahami.com/the-chosen-one\n","authors":["Omri Avrahami","Amir Hertz","Yael Vinker","Moab Arar","Shlomi Fruchter","Ohad Fried","Daniel Cohen-Or","Dani Lischinski"],"pdf_url":"https://arxiv.org/pdf/2311.10093v2.pdf","comment":"Project page is available at https://omriavrahami.com/the-chosen-one"},{"id":"http://arxiv.org/abs/2311.15951v1","updated":"2023-11-27T15:57:11Z","published":"2023-11-27T15:57:11Z","title":"Replay across Experiments: A Natural Extension of Off-Policy RL","summary":" Replaying data is a principal mechanism underlying the stability and data\nefficiency of off-policy reinforcement learning (RL). We present an effective\nyet simple framework to extend the use of replays across multiple experiments,\nminimally adapting the RL workflow for sizeable improvements in controller\nperformance and research iteration times. At its core, Replay Across\nExperiments (RaE) involves reusing experience from previous experiments to\nimprove exploration and bootstrap learning while reducing required changes to a\nminimum in comparison to prior work. We empirically show benefits across a\nnumber of RL algorithms and challenging control domains spanning both\nlocomotion and manipulation, including hard exploration tasks from egocentric\nvision. Through comprehensive ablations, we demonstrate robustness to the\nquality and amount of data available and various hyperparameter choices.\nFinally, we discuss how our approach can be applied more broadly across\nresearch life cycles and can increase resilience by reloading data across\nrandom seeds or hyperparameter variations.\n","authors":["Dhruva Tirumala","Thomas Lampe","Jose Enrique Chen","Tuomas Haarnoja","Sandy Huang","Guy Lever","Ben Moran","Tim Hertweck","Leonard Hasenclever","Martin Riedmiller","Nicolas Heess","Markus Wulfmeier"],"pdf_url":"https://arxiv.org/pdf/2311.15951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00577v2","updated":"2023-11-27T15:57:06Z","published":"2023-06-01T11:45:45Z","title":"TorchRL: A data-driven decision-making library for PyTorch","summary":" PyTorch has ascended as a premier machine learning framework, yet it lacks a\nnative and comprehensive library for decision and control tasks suitable for\nlarge development teams dealing with complex real-world data and environments.\nTo address this issue, we propose TorchRL, a generalistic control library for\nPyTorch that provides well-integrated, yet standalone components. We introduce\na new and flexible PyTorch primitive, the TensorDict, which facilitates\nstreamlined algorithm development across the many branches of Reinforcement\nLearning (RL) and control. We provide a detailed description of the building\nblocks and an extensive overview of the library across domains and tasks.\nFinally, we experimentally demonstrate its reliability and flexibility and show\ncomparative benchmarks to demonstrate its computational efficiency. TorchRL\nfosters long-term support and is publicly available on GitHub for greater\nreproducibility and collaboration within the research community. 
The code is\nopen-sourced on GitHub.\n","authors":["Albert Bou","Matteo Bettini","Sebastian Dittert","Vikash Kumar","Shagun Sodhani","Xiaomeng Yang","Gianni De Fabritiis","Vincent Moens"],"pdf_url":"https://arxiv.org/pdf/2306.00577v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15947v1","updated":"2023-11-27T15:54:20Z","published":"2023-11-27T15:54:20Z","title":"GloNets: Globally Connected Neural Networks","summary":" Deep learning architectures suffer from depth-related performance\ndegradation, limiting the effective depth of neural networks. Approaches like\nResNet are able to mitigate this, but they do not completely eliminate the\nproblem. We introduce Globally Connected Neural Networks (GloNet), a novel\narchitecture overcoming depth-related issues, designed to be superimposed on\nany model, enhancing its depth without increasing complexity or reducing\nperformance. With GloNet, the network's head uniformly receives information\nfrom all parts of the network, regardless of their level of abstraction. This\nenables GloNet to self-regulate information flow during training, reducing the\ninfluence of less effective deeper layers, and allowing for stable training\nirrespective of network depth. This paper details GloNet's design, its\ntheoretical basis, and a comparison with existing similar architectures.\nExperiments show GloNet's self-regulation ability and resilience to\ndepth-related learning challenges, like performance degradation. Our findings\nsuggest GloNet as a strong alternative to traditional architectures like\nResNets.\n","authors":["Antonio Di Cecco","Carlo Metta","Marco Fantozzi","Francesco Morandin","Maurizio Parton"],"pdf_url":"https://arxiv.org/pdf/2311.15947v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15945v1","updated":"2023-11-27T15:51:07Z","published":"2023-11-27T15:51:07Z","title":"Over-Squashing in Riemannian Graph Neural Networks","summary":" Most graph neural networks (GNNs) are prone to the phenomenon of\nover-squashing in which node features become insensitive to information from\ndistant nodes in the graph. Recent works have shown that the topology of the\ngraph has the greatest impact on over-squashing, suggesting graph rewiring\napproaches as a suitable solution. In this work, we explore whether\nover-squashing can be mitigated through the embedding space of the GNN. In\nparticular, we consider the generalization of Hyperbolic GNNs (HGNNs) to\nRiemannian manifolds of variable curvature in which the geometry of the\nembedding space is faithful to the graph's topology. We derive bounds on the\nsensitivity of the node features in these Riemannian GNNs as the number of\nlayers increases, which yield promising theoretical and empirical results for\nalleviating over-squashing in graphs with negative curvature.\n","authors":["Julia Balla"],"pdf_url":"https://arxiv.org/pdf/2311.15945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15940v1","updated":"2023-11-27T15:47:33Z","published":"2023-11-27T15:47:33Z","title":"Physics-informed neural networks for transformed geometries and\n manifolds","summary":" Physics-informed neural networks (PINNs) effectively embed physical\nprinciples into machine learning, but often struggle with complex or\nalternating geometries. We propose a novel method for integrating geometric\ntransformations within PINNs to robustly accommodate geometric variations. 
Our\nmethod incorporates a diffeomorphism as a mapping of a reference domain and\nadapts the derivative computation of the physics-informed loss function. This\ngeneralizes the applicability of PINNs not only to smoothly deformed domains,\nbut also to lower-dimensional manifolds and allows for direct shape\noptimization while training the network. We demonstrate the effectivity of our\napproach on several problems: (i) Eikonal equation on Archimedean spiral, (ii)\nPoisson problem on surface manifold, (iii) Incompressible Stokes flow in\ndeformed tube, and (iv) Shape optimization with Laplace operator. Through these\nexamples, we demonstrate the enhanced flexibility over traditional PINNs,\nespecially under geometric variations. The proposed framework presents an\noutlook for training deep neural operators over parametrized geometries, paving\nthe way for advanced modeling with PDEs on complex geometries in science and\nengineering.\n","authors":["Samuel Burbulla"],"pdf_url":"https://arxiv.org/pdf/2311.15940v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15936v1","updated":"2023-11-27T15:45:02Z","published":"2023-11-27T15:45:02Z","title":"Towards Responsible Governance of Biological Design Tools","summary":" Recent advancements in generative machine learning have enabled rapid\nprogress in biological design tools (BDTs) such as protein structure and\nsequence prediction models. The unprecedented predictive accuracy and novel\ndesign capabilities of BDTs present new and significant dual-use risks. For\nexample, their predictive accuracy allows biological agents, whether vaccines\nor pathogens, to be developed more quickly, while the design capabilities could\nbe used to discover drugs or evade DNA screening techniques. Similar to other\ndual-use AI systems, BDTs present a wicked problem: how can regulators uphold\npublic safety without stifling innovation? We highlight how current regulatory\nproposals that are primarily tailored toward large language models may be less\neffective for BDTs, which require fewer computational resources to train and\nare often developed in an open-source manner. We propose a range of measures to\nmitigate the risk that BDTs are misused, across the areas of responsible\ndevelopment, risk assessment, transparency, access management, cybersecurity,\nand investing in resilience. Implementing such measures will require close\ncoordination between developers and governments.\n","authors":["Richard Moulange","Max Langenkamp","Tessa Alexanian","Samuel Curtis","Morgan Livingston"],"pdf_url":"https://arxiv.org/pdf/2311.15936v1.pdf","comment":"10 pages + references, 1 figure, accepted at NeurIPS 2023 Regulatable\n ML as oral presentation"},{"id":"http://arxiv.org/abs/2307.06431v2","updated":"2023-11-27T15:38:32Z","published":"2023-07-12T19:51:49Z","title":"Energy Discrepancies: A Score-Independent Loss for Energy-Based Models","summary":" Energy-based models are a simple yet powerful class of probabilistic models,\nbut their widespread adoption has been limited by the computational burden of\ntraining them. We propose a novel loss function called Energy Discrepancy (ED)\nwhich does not rely on the computation of scores or expensive Markov chain\nMonte Carlo. We show that ED approaches the explicit score matching and\nnegative log-likelihood loss under different limits, effectively interpolating\nbetween both. 
Consequently, minimum ED estimation overcomes the problem of\nnearsightedness encountered in score-based estimation methods, while also\nenjoying theoretical guarantees. Through numerical experiments, we demonstrate\nthat ED learns low-dimensional data distributions faster and more accurately\nthan explicit score matching or contrastive divergence. For high-dimensional\nimage data, we describe how the manifold hypothesis puts limitations on our\napproach and demonstrate the effectiveness of energy discrepancy by training\nthe energy-based model as a prior of a variational decoder model.\n","authors":["Tobias Schröder","Zijing Ou","Jen Ning Lim","Yingzhen Li","Sebastian J. Vollmer","Andrew B. Duncan"],"pdf_url":"https://arxiv.org/pdf/2307.06431v2.pdf","comment":"Camera Ready version for the 37th Conference on Neural Information\n Processing Systems (NeurIPS 2023). Changes in this revision: Appendix A1:\n Corrected proof of Theorem 1. Appendix D3: Added definition and numerical\n experiments for energy discrepancy on binary discrete spaces. Minor changes\n in the main text and correction of typos. Added new references"},{"id":"http://arxiv.org/abs/2311.15925v1","updated":"2023-11-27T15:37:05Z","published":"2023-11-27T15:37:05Z","title":"Reinforcement Learning for Wildfire Mitigation in Simulated Disaster\n Environments","summary":" Climate change has resulted in a year over year increase in adverse weather\nand weather conditions which contribute to increasingly severe fire seasons.\nWithout effective mitigation, these fires pose a threat to life, property,\necology, cultural heritage, and critical infrastructure. To better prepare for\nand react to the increasing threat of wildfires, more accurate fire modelers\nand mitigation responses are necessary. In this paper, we introduce SimFire, a\nversatile wildland fire projection simulator designed to generate realistic\nwildfire scenarios, and SimHarness, a modular agent-based machine learning\nwrapper capable of automatically generating land management strategies within\nSimFire to reduce the overall damage to the area. Together, this publicly\navailable system allows researchers and practitioners the ability to emulate\nand assess the effectiveness of firefighter interventions and formulate\nstrategic plans that prioritize value preservation and resource allocation\noptimization. The repositories are available for download at\nhttps://github.com/mitrefireline.\n","authors":["Alexander Tapley","Marissa Dotter","Michael Doyle","Aidan Fennelly","Dhanuj Gandikota","Savanna Smith","Michael Threet","Tim Welsh"],"pdf_url":"https://arxiv.org/pdf/2311.15925v1.pdf","comment":"12 pages, 4 figures including Appendices (A, B). Accepted as a paper\n in the Proposals track at the \"Tackling Climate Change with Machine Learning\"\n workshop at NeurIPS 2023. MITRE Public Release Case Number 23-3920"},{"id":"http://arxiv.org/abs/2311.15924v1","updated":"2023-11-27T15:34:40Z","published":"2023-11-27T15:34:40Z","title":"Diagnosis driven Anomaly Detection for CPS","summary":" In Cyber-Physical Systems (CPS) research, anomaly detection (detecting\nabnormal behavior) and diagnosis (identifying the underlying root cause) are\noften treated as distinct, isolated tasks. However, diagnosis algorithms\nrequire symptoms, i.e. temporally and spatially isolated anomalies, as input.\nThus, anomaly detection and diagnosis must be developed together to provide a\nholistic solution for diagnosis in CPS. 
We therefore propose a method for\nutilizing deep learning-based anomaly detection to generate inputs for\nConsistency-Based Diagnosis (CBD). We evaluate our approach on a simulated and\na real-world CPS dataset, where our model demonstrates strong performance\nrelative to other state-of-the-art models.\n","authors":["Henrik S. Steude","Lukas Moddemann","Alexander Diedrich","Jonas Ehrhardt","Oliver Niggemann"],"pdf_url":"https://arxiv.org/pdf/2311.15924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07590v2","updated":"2023-11-27T15:17:49Z","published":"2023-11-09T17:12:44Z","title":"Technical Report: Large Language Models can Strategically Deceive their\n Users when Put Under Pressure","summary":" We demonstrate a situation in which Large Language Models, trained to be\nhelpful, harmless, and honest, can display misaligned behavior and\nstrategically deceive their users about this behavior without being instructed\nto do so. Concretely, we deploy GPT-4 as an agent in a realistic, simulated\nenvironment, where it assumes the role of an autonomous stock trading agent.\nWithin this environment, the model obtains an insider tip about a lucrative\nstock trade and acts upon it despite knowing that insider trading is\ndisapproved of by company management. When reporting to its manager, the model\nconsistently hides the genuine reasons behind its trading decision. We perform\na brief investigation of how this behavior varies under changes to the setting,\nsuch as removing model access to a reasoning scratchpad, attempting to prevent\nthe misaligned behavior by changing system instructions, changing the amount of\npressure the model is under, varying the perceived risk of getting caught, and\nmaking other simple changes to the environment. To our knowledge, this is the\nfirst demonstration of Large Language Models trained to be helpful, harmless,\nand honest, strategically deceiving their users in a realistic situation\nwithout direct instructions or training for deception.\n","authors":["Jérémy Scheurer","Mikita Balesni","Marius Hobbhahn"],"pdf_url":"https://arxiv.org/pdf/2311.07590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15906v1","updated":"2023-11-27T15:13:02Z","published":"2023-11-27T15:13:02Z","title":"MetaDefa: Meta-learning based on Domain Enhancement and Feature\n Alignment for Single Domain Generalization","summary":" The single domain generalization(SDG) based on meta-learning has emerged as\nan effective technique for solving the domain-shift problem. However, the\ninadequate match of data distribution between source and augmented domains and\ndifficult separation of domain-invariant features from domain-related features\nmake SDG model hard to achieve great generalization. Therefore, a novel\nmeta-learning method based on domain enhancement and feature alignment\n(MetaDefa) is proposed to improve the model generalization performance. First,\nthe background substitution and visual corruptions techniques are used to\ngenerate diverse and effective augmented domains. Then, the multi-channel\nfeature alignment module based on class activation maps and class agnostic\nactivation maps is designed to effectively extract adequate transferability\nknowledge. 
In this module, domain-invariant features can be fully explored by\nfocusing on similar target regions between source and augmented domains feature\nspace and suppressing the feature representation of non-similar target regions.\nExtensive experiments on two publicly available datasets show that MetaDefa has\nsignificant generalization performance advantages in unknown multiple target\ndomains.\n","authors":["Can Sun","Hao Zheng","Zhigang Hu","Liu Yang","Meiguang Zheng","Bo Xu"],"pdf_url":"https://arxiv.org/pdf/2311.15906v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2111.08239v2","updated":"2023-11-27T15:10:18Z","published":"2021-11-16T05:49:56Z","title":"Assessing Deep Neural Networks as Probability Estimators","summary":" Deep Neural Networks (DNNs) have performed admirably in classification tasks.\nHowever, the characterization of their classification uncertainties, required\nfor certain applications, has been lacking. In this work, we investigate the\nissue by assessing DNNs' ability to estimate conditional probabilities and\npropose a framework for systematic uncertainty characterization. Denoting the\ninput sample as x and the category as y, the classification task of assigning a\ncategory y to a given input x can be reduced to the task of estimating the\nconditional probabilities p(y|x), as approximated by the DNN at its last layer\nusing the softmax function. Since softmax yields a vector whose elements all\nfall in the interval (0, 1) and sum to 1, it suggests a probabilistic\ninterpretation to the DNN's outcome. Using synthetic and real-world datasets,\nwe look into the impact of various factors, e.g., probability density f(x) and\ninter-categorical sparsity, on the precision of DNNs' estimations of p(y|x),\nand find that the likelihood probability density and the inter-categorical\nsparsity have greater impacts than the prior probability to DNNs'\nclassification uncertainty.\n","authors":["Yu Pan","Kwo-Sen Kuo","Michael L. Rilee","Hongfeng Yu"],"pdf_url":"https://arxiv.org/pdf/2111.08239v2.pdf","comment":"Y. Pan, K. Kuo, M. Rilee and H. Yu, \"Assessing Deep Neural Networks\n as Probability Estimators,\" in 2021 IEEE International Conference on Big Data\n (Big Data), Orlando, FL, USA, 2021 pp. 1083-1091. doi:\n 10.1109/BigData52589.2021.9671328"},{"id":"http://arxiv.org/abs/2311.15890v1","updated":"2023-11-27T14:56:47Z","published":"2023-11-27T14:56:47Z","title":"Stability-Informed Initialization of Neural Ordinary Differential\n Equations","summary":" This paper addresses the training of Neural Ordinary Differential Equations\n(neural ODEs), and in particular explores the interplay between numerical\nintegration techniques, stability regions, step size, and initialization\ntechniques. It is shown how the choice of integration technique implicitly\nregularizes the learned model, and how the solver's corresponding stability\nregion affects training and prediction performance. From this analysis, a\nstability-informed parameter initialization technique is introduced. 
The\neffectiveness of the initialization method is displayed across several learning\nbenchmarks and industrial applications.\n","authors":["Theodor Westny","Arman Mohammadi","Daniel Jung","Erik Frisk"],"pdf_url":"https://arxiv.org/pdf/2311.15890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15887v1","updated":"2023-11-27T14:55:16Z","published":"2023-11-27T14:55:16Z","title":"FLASC: A Flare-Sensitive Clustering Algorithm: Extending HDBSCAN* for\n Detecting Branches in Clusters","summary":" We present FLASC, an algorithm for flare-sensitive clustering. Our algorithm\nbuilds upon HDBSCAN* -- which provides high-quality density-based clustering\nperformance -- through a post-processing step that differentiates branches\nwithin the detected clusters' manifold, adding a type of pattern that can be\ndiscovered. Two variants of the algorithm are presented, which trade\ncomputational cost for noise robustness. We show that both variants scale\nsimilarly to HDBSCAN* in terms of computational cost and provide stable outputs\nusing synthetic data sets, resulting in an efficient flare-sensitive clustering\nalgorithm. In addition, we demonstrate the algorithm's benefit in data\nexploration over HDBSCAN* clustering on two real-world data sets.\n","authors":["D. M. Bot","J. Peeters","J. Liesenborgs","J. Aerts"],"pdf_url":"https://arxiv.org/pdf/2311.15887v1.pdf","comment":"20 pages, 11 figures, submitted to ACM TKDD"},{"id":"http://arxiv.org/abs/2311.15876v1","updated":"2023-11-27T14:49:06Z","published":"2023-11-27T14:49:06Z","title":"RO-LLaMA: Generalist LLM for Radiation Oncology via Noise Augmentation\n and Consistency Regularization","summary":" Recent advancements in Artificial Intelligence (AI) have profoundly\ninfluenced medical fields, by providing tools to reduce clinical workloads.\nHowever, most AI models are constrained to execute uni-modal tasks, in stark\ncontrast to the comprehensive approaches utilized by medical professionals. To\naddress this, here we present RO-LLaMA, a versatile generalist large language\nmodel (LLM) tailored for the field of radiation oncology. This model seamlessly\ncovers a wide range of the workflow of radiation oncologists, adept at various\ntasks such as clinical report summarization, radiation therapy plan suggestion,\nand plan-guided therapy target volume segmentation. In particular, to maximize\nthe end-to-end performance, we further present a novel Consistency Embedding\nFine-Tuning (CEFTune) technique, which boosts LLM's robustness to additional\nerrors at the intermediates while preserving the capability of handling clean\ninputs, and creatively transform this concept into LLM-driven segmentation\nframework as Consistency Embedding Segmentation (CESEG). Experimental results\non multi-centre cohort sets demonstrate our proposed RO-LLaMA's promising\nperformance for diverse tasks with generalization capabilities.\n","authors":["Kwanyoung Kim","Yujin Oh","Sangjoon Park","Hwa Kyung Byun","Jin Sung Kim","Yong Bae Kim","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2311.15876v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15875v1","updated":"2023-11-27T14:48:37Z","published":"2023-11-27T14:48:37Z","title":"Nodal Hydraulic Head Estimation through Unscented Kalman Filter for\n Data-driven Leak Localization in Water Networks","summary":" In this paper, we present a nodal hydraulic head estimation methodology for\nwater distribution networks (WDN) based on an Unscented Kalman Filter (UKF)\nscheme with application to leak localization. 
The UKF refines an initial\nestimation of the hydraulic state by considering the prediction model, as well\nas available pressure and demand measurements. To this end, it provides\ncustomized prediction and data assimilation steps. Additionally, the method is\nenhanced by dynamically updating the prediction function weight matrices.\nPerformance testing on the Modena benchmark under realistic conditions\ndemonstrates the method's effectiveness in enhancing state estimation and\ndata-driven leak localization.\n","authors":["Luis Romero-Ben","Paul Irofti","Florin Stoican","Vicenç Puig"],"pdf_url":"https://arxiv.org/pdf/2311.15875v1.pdf","comment":"This work has been submitted to IFAC for possible publication. It has\n 6 pages and 3 figures"},{"id":"http://arxiv.org/abs/2306.00349v2","updated":"2023-11-27T14:42:52Z","published":"2023-06-01T05:06:56Z","title":"CALICO: Self-Supervised Camera-LiDAR Contrastive Pre-training for BEV\n Perception","summary":" Perception is crucial in the realm of autonomous driving systems, where\nbird's eye view (BEV)-based architectures have recently reached\nstate-of-the-art performance. The desirability of self-supervised\nrepresentation learning stems from the expensive and laborious process of\nannotating 2D and 3D data. Although previous research has investigated\npretraining methods for both LiDAR and camera-based 3D object detection, a\nunified pretraining framework for multimodal BEV perception is missing. In this\nstudy, we introduce CALICO, a novel framework that applies contrastive\nobjectives to both LiDAR and camera backbones. Specifically, CALICO\nincorporates two stages: point-region contrast (PRC) and region-aware\ndistillation (RAD). PRC better balances the region- and scene-level\nrepresentation learning on the LiDAR modality and offers significant\nperformance improvement compared to existing methods. RAD effectively achieves\ncontrastive distillation on our self-trained teacher model. CALICO's efficacy\nis substantiated by extensive evaluations on 3D object detection and BEV map\nsegmentation tasks, where it delivers significant performance improvements.\nNotably, CALICO outperforms the baseline method by 10.5% and 8.6% on NDS and\nmAP. Moreover, CALICO boosts the robustness of multimodal 3D object detection\nagainst adversarial attacks and corruption. Additionally, our framework can be\ntailored to different backbones and heads, positioning it as a promising\napproach for multimodal BEV perception.\n","authors":["Jiachen Sun","Haizhong Zheng","Qingzhao Zhang","Atul Prakash","Z. Morley Mao","Chaowei Xiao"],"pdf_url":"https://arxiv.org/pdf/2306.00349v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15176v2","updated":"2023-11-27T14:35:05Z","published":"2023-07-27T20:11:07Z","title":"RCT Rejection Sampling for Causal Estimation Evaluation","summary":" Confounding is a significant obstacle to unbiased estimation of causal\neffects from observational data. For settings with high-dimensional covariates\n-- such as text data, genomics, or the behavioral social sciences --\nresearchers have proposed methods to adjust for confounding by adapting machine\nlearning methods to the goal of causal estimation. However, empirical\nevaluation of these adjustment methods has been challenging and limited. 
In\nthis work, we build on a promising empirical evaluation strategy that\nsimplifies evaluation design and uses real data: subsampling randomized\ncontrolled trials (RCTs) to create confounded observational datasets while\nusing the average causal effects from the RCTs as ground-truth. We contribute a\nnew sampling algorithm, which we call RCT rejection sampling, and provide\ntheoretical guarantees that causal identification holds in the observational\ndata to allow for valid comparisons to the ground-truth RCT. Using synthetic\ndata, we show our algorithm indeed results in low bias when oracle estimators\nare evaluated on the confounded samples, which is not always the case for a\npreviously proposed algorithm. In addition to this identification result, we\nhighlight several finite data considerations for evaluation designers who plan\nto use RCT rejection sampling on their own datasets. As a proof of concept, we\nimplement an example evaluation pipeline and walk through these finite data\nconsiderations with a novel, real-world RCT -- which we release publicly --\nconsisting of approximately 70k observations and text data as high-dimensional\ncovariates. Together, these contributions build towards a broader agenda of\nimproved empirical evaluation for causal estimation.\n","authors":["Katherine A. Keith","Sergey Feldman","David Jurgens","Jonathan Bragg","Rohit Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2307.15176v2.pdf","comment":"Code and data at https://github.com/kakeith/rct_rejection_sampling"},{"id":"http://arxiv.org/abs/2311.15865v1","updated":"2023-11-27T14:33:21Z","published":"2023-11-27T14:33:21Z","title":"A precise symbolic emulator of the linear matter power spectrum","summary":" Computing the matter power spectrum, $P(k)$, as a function of cosmological\nparameters can be prohibitively slow in cosmological analyses, hence emulating\nthis calculation is desirable. Previous analytic approximations are\ninsufficiently accurate for modern applications, so black-box, uninterpretable\nemulators are often used. We utilise an efficient genetic programming based\nsymbolic regression framework to explore the space of potential mathematical\nexpressions which can approximate the power spectrum and $\\sigma_8$. We learn\nthe ratio between an existing low-accuracy fitting function for $P(k)$ and that\nobtained by solving the Boltzmann equations and thus still incorporate the\nphysics which motivated this earlier approximation. We obtain an analytic\napproximation to the linear power spectrum with a root mean squared fractional\nerror of 0.2% between $k = 9\\times10^{-3} - 9 \\, h{\\rm \\, Mpc^{-1}}$ and across\na wide range of cosmological parameters, and we provide physical\ninterpretations for various terms in the expression. We also provide a simple\nanalytic approximation for $\\sigma_8$ with a similar accuracy, with a root mean\nsquared fractional error of just 0.4% when evaluated across the same range of\ncosmologies. This function is easily invertible to obtain $A_{\\rm s}$ as a\nfunction of $\\sigma_8$ and the other cosmological parameters, if preferred. It\nis possible to obtain symbolic approximations to a seemingly complex function\nat a precision required for current and future cosmological analyses without\nresorting to deep-learning techniques, thus avoiding their black-box nature and\nlarge number of parameters. Our emulator will be usable long after the codes on\nwhich numerical approximations are built become outdated.\n","authors":["Deaglan J. 
Bartlett","Lukas Kammerer","Gabriel Kronberger","Harry Desmond","Pedro G. Ferreira","Benjamin D. Wandelt","Bogdan Burlacu","David Alonso","Matteo Zennaro"],"pdf_url":"https://arxiv.org/pdf/2311.15865v1.pdf","comment":"9 pages, 5 figures. Submitted to A&A"},{"id":"http://arxiv.org/abs/2311.15858v1","updated":"2023-11-27T14:25:40Z","published":"2023-11-27T14:25:40Z","title":"Multi-Agent Reinforcement Learning for Power Control in Wireless\n Networks via Adaptive Graphs","summary":" The ever-increasing demand for high-quality and heterogeneous wireless\ncommunication services has driven extensive research on dynamic optimization\nstrategies in wireless networks. Among several possible approaches, multi-agent\ndeep reinforcement learning (MADRL) has emerged as a promising method to\naddress a wide range of complex optimization problems like power control.\nHowever, the seamless application of MADRL to a variety of network optimization\nproblems faces several challenges related to convergence. In this paper, we\npresent the use of graphs as communication-inducing structures among\ndistributed agents as an effective means to mitigate these challenges.\nSpecifically, we harness graph neural networks (GNNs) as neural architectures\nfor policy parameterization to introduce a relational inductive bias in the\ncollective decision-making process. Most importantly, we focus on modeling the\ndynamic interactions among sets of neighboring agents through the introduction\nof innovative methods for defining a graph-induced framework for integrated\ncommunication and learning. Finally, the superior generalization capabilities\nof the proposed methodology to larger networks and to networks with different\nuser categories is verified through simulations.\n","authors":["Lorenzo Mario Amorosa","Marco Skocaj","Roberto Verdone","Deniz Gündüz"],"pdf_url":"https://arxiv.org/pdf/2311.15858v1.pdf","comment":"6 pages, 4 figures. This work has been submitted to the IEEE for\n possible publication. Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2311.15854v1","updated":"2023-11-27T14:21:47Z","published":"2023-11-27T14:21:47Z","title":"A systematic study comparing hyperparameter optimization engines on\n tabular data","summary":" We run an independent comparison of all hyperparameter optimization\n(hyperopt) engines available in the Ray Tune library. We introduce two ways to\nnormalize and aggregate statistics across data sets and models, one rank-based,\nand another one sandwiching the score between the random search score and the\nfull grid search score. This affords us i) to rank the hyperopt engines, ii) to\nmake generalized and statistically significant statements on how much they\nimprove over random search, and iii) to make recommendations on which engine\nshould be used to hyperopt a given learning algorithm. We find that most\nengines beat random search, but that only three of them (HEBO, AX, and\nBlendSearch) clearly stand out. 
We also found that some engines seem to\nspecialize in hyperopting certain learning algorithms, which makes it tricky to\nuse hyperopt in comparison studies, since the choice of the hyperopt technique\nmay favor some of the models in the comparison.\n","authors":["Balazs Kegl"],"pdf_url":"https://arxiv.org/pdf/2311.15854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15847v1","updated":"2023-11-27T14:12:51Z","published":"2023-11-27T14:12:51Z","title":"Cell Maps Representation For Lung Adenocarcinoma Growth Patterns\n Classification In Whole Slide Images","summary":" Lung adenocarcinoma is a morphologically heterogeneous disease, characterized\nby five primary histologic growth patterns. The quantity of these patterns can\nbe related to tumor behavior and has a significant impact on patient prognosis.\nIn this work, we propose a novel machine learning pipeline capable of\nclassifying tissue tiles into one of the five patterns or as non-tumor, with an\nArea Under the Receiver Operating Characteristic Curve (AUCROC) score of 0.97.\nOur model's strength lies in its comprehensive consideration of cellular\nspatial patterns, where it first generates cell maps from Hematoxylin and Eosin\n(H&E) whole slide images (WSIs), which are then fed into a convolutional neural\nnetwork classification model. Exploiting these cell maps provides the model\nwith robust generalizability to new data, achieving approximately 30% higher\naccuracy on unseen test-sets compared to current state of the art approaches.\nThe insights derived from our model can be used to predict prognosis, enhancing\npatient outcomes.\n","authors":["Arwa Al-Rubaian","Gozde N. Gunesli","Wajd A. Althakfi","Ayesha Azam","Nasir Rajpoot","Shan E Ahmed Raza"],"pdf_url":"https://arxiv.org/pdf/2311.15847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15838v1","updated":"2023-11-27T14:02:47Z","published":"2023-11-27T14:02:47Z","title":"Utilizing Explainability Techniques for Reinforcement Learning Model\n Assurance","summary":" Explainable Reinforcement Learning (XRL) can provide transparency into the\ndecision-making process of a Deep Reinforcement Learning (DRL) model and\nincrease user trust and adoption in real-world use cases. By utilizing XRL\ntechniques, researchers can identify potential vulnerabilities within a trained\nDRL model prior to deployment, therefore limiting the potential for mission\nfailure or mistakes by the system. This paper introduces the ARLIN (Assured RL\nModel Interrogation) Toolkit, an open-source Python library that identifies\npotential vulnerabilities and critical points within trained DRL models through\ndetailed, human-interpretable explainability outputs. To illustrate ARLIN's\neffectiveness, we provide explainability visualizations and vulnerability\nanalysis for a publicly available DRL model. The open-source code repository is\navailable for download at https://github.com/mitre/arlin.\n","authors":["Alexander Tapley","Kyle Gatesman","Luis Robaina","Brett Bissey","Joseph Weissman"],"pdf_url":"https://arxiv.org/pdf/2311.15838v1.pdf","comment":"9 pages, 8 figures including appendices (A, B, C). Accepted as a\n poster presentation in the demo track at the \"XAI in Action: Past, Present,\n and Future Applications\" workshop at NeurIPS 2023. 
MITRE Public Release Case\n Number 23-3095"},{"id":"http://arxiv.org/abs/2311.15831v1","updated":"2023-11-27T13:55:21Z","published":"2023-11-27T13:55:21Z","title":"Temporal Action Localization for Inertial-based Human Activity\n Recognition","summary":" A persistent trend in Deep Learning has been the applicability of machine\nlearning concepts to other areas than originally introduced for. As of today,\nstate-of-the-art activity recognition from wearable sensors relies on\nclassifiers being trained on fixed windows of data. Contrarily, video-based\nHuman Activity Recognition has followed a segment-based prediction approach,\nlocalizing activity occurrences from start to end. This paper is the first to\nsystematically demonstrate the applicability of state-of-the-art TAL models for\nwearable Human Activity Recongition (HAR) using raw inertial data as input. Our\nresults show that state-of-the-art TAL models are able to outperform popular\ninertial models on 4 out of 6 wearable activity recognition benchmark datasets,\nwith improvements ranging as much as 25% in F1-score. Introducing the TAL\ncommunity's most popular metric to inertial-based HAR, namely mean Average\nPrecision, our analysis shows that TAL models are able to produce more coherent\nsegments along with an overall higher NULL-class accuracy across all datasets.\nBeing the first to provide such an analysis, the TAL community offers an\ninteresting new perspective to inertial-based HAR with yet to be explored\ndesign choices and training concepts, which could be of significant value for\nthe inertial-based HAR community.\n","authors":["Marius Bock","Michael Moeller","Kristof Van Laerhoven"],"pdf_url":"https://arxiv.org/pdf/2311.15831v1.pdf","comment":"20 pages, 7 figures, 2 tables"},{"id":"http://arxiv.org/abs/2311.15816v1","updated":"2023-11-27T13:41:20Z","published":"2023-11-27T13:41:20Z","title":"Scale-Dropout: Estimating Uncertainty in Deep Neural Networks Using\n Stochastic Scale","summary":" Uncertainty estimation in Neural Networks (NNs) is vital in improving\nreliability and confidence in predictions, particularly in safety-critical\napplications. Bayesian Neural Networks (BayNNs) with Dropout as an\napproximation offer a systematic approach to quantifying uncertainty, but they\ninherently suffer from high hardware overhead in terms of power, memory, and\ncomputation. Thus, the applicability of BayNNs to edge devices with limited\nresources or to high-performance applications is challenging. Some of the\ninherent costs of BayNNs can be reduced by accelerating them in hardware on a\nComputation-In-Memory (CIM) architecture with spintronic memories and\nbinarizing their parameters. However, numerous stochastic units are required to\nimplement conventional dropout-based BayNN. In this paper, we propose the Scale\nDropout, a novel regularization technique for Binary Neural Networks (BNNs),\nand Monte Carlo-Scale Dropout (MC-Scale Dropout)-based BayNNs for efficient\nuncertainty estimation. Our approach requires only one stochastic unit for the\nentire model, irrespective of the model size, leading to a highly scalable\nBayesian NN. Furthermore, we introduce a novel Spintronic memory-based CIM\narchitecture for the proposed BayNN that achieves more than $100\\times$ energy\nsavings compared to the state-of-the-art. 
We validated our method to show up to\na $1\\%$ improvement in predictive performance and superior uncertainty\nestimates compared to related works.\n","authors":["Soyed Tuhin Ahmed","Kamal Danouchi","Michael Hefenbrock","Guillaume Prenat","Lorena Anghel","Mehdi B. Tahoori"],"pdf_url":"https://arxiv.org/pdf/2311.15816v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15807v1","updated":"2023-11-27T13:30:20Z","published":"2023-11-27T13:30:20Z","title":"Exploring Artificial Intelligence Methods for Energy Prediction in\n Healthcare Facilities: An In-Depth Extended Systematic Review","summary":" Hospitals, due to their complexity and unique requirements, play a pivotal\nrole in global energy consumption patterns. This study conducted a\ncomprehensive literature review, utilizing the PRISMA framework, of articles\nthat employed machine learning and artificial intelligence techniques for\npredicting energy consumption in hospital buildings. Of the 1884 publications\nidentified, 17 were found to address this specific domain and have been\nthoroughly reviewed to establish the state-of-the-art and identify gaps where\nfuture research is needed. This review revealed a diverse range of data inputs\ninfluencing energy prediction, with occupancy and meteorological data emerging\nas significant predictors. However, many studies failed to delve deep into the\nimplications of their data choices, and gaps were evident regarding the\nunderstanding of time dynamics, operational status, and preprocessing methods.\nMachine learning, especially deep learning models like ANNs, have shown\npotential in this domain, yet they come with challenges, including\ninterpretability and computational demands. The findings underscore the immense\npotential of AI in optimizing hospital energy consumption but also highlight\nthe need for more comprehensive and granular research. Key areas for future\nresearch include the optimization of ANN approaches, new optimization and data\nintegration techniques, the integration of real-time data into Intelligent\nEnergy Management Systems, and increasing focus on long-term energy\nforecasting.\n","authors":["Marjan FatehiJananloo","Helen Stopps","J. J. McArthur"],"pdf_url":"https://arxiv.org/pdf/2311.15807v1.pdf","comment":"38 pages, 1 figure, 3 tables, systematic literature review"},{"id":"http://arxiv.org/abs/2311.15792v1","updated":"2023-11-27T13:14:39Z","published":"2023-11-27T13:14:39Z","title":"Rethinking Privacy in Machine Learning Pipelines from an Information\n Flow Control Perspective","summary":" Modern machine learning systems use models trained on ever-growing corpora.\nTypically, metadata such as ownership, access control, or licensing information\nis ignored during training. Instead, to mitigate privacy risks, we rely on\ngeneric techniques such as dataset sanitization and differentially private\nmodel training, with inherent privacy/utility trade-offs that hurt model\nperformance. Moreover, these techniques have limitations in scenarios where\nsensitive information is shared across multiple participants and fine-grained\naccess control is required. By ignoring metadata, we therefore miss an\nopportunity to better address security, privacy, and confidentiality\nchallenges. In this paper, we take an information flow control perspective to\ndescribe machine learning systems, which allows us to leverage metadata such as\naccess control policies and define clear-cut privacy and confidentiality\nguarantees with interpretable information flows. 
Under this perspective, we\ncontrast two different approaches to achieve user-level non-interference: 1)\nfine-tuning per-user models, and 2) retrieval augmented models that access\nuser-specific datasets at inference time. We compare these two approaches to a\ntrivially non-interfering zero-shot baseline using a public model and to a\nbaseline that fine-tunes this model on the whole corpus. We evaluate trained\nmodels on two datasets of scientific articles and demonstrate that retrieval\naugmented architectures deliver the best utility, scalability, and flexibility\nwhile satisfying strict non-interference guarantees.\n","authors":["Lukas Wutschitz","Boris Köpf","Andrew Paverd","Saravan Rajmohan","Ahmed Salem","Shruti Tople","Santiago Zanella-Béguelin","Menglin Xia","Victor Rühle"],"pdf_url":"https://arxiv.org/pdf/2311.15792v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01276v2","updated":"2023-11-27T13:02:50Z","published":"2023-11-02T14:44:50Z","title":"Long-Range Neural Atom Learning for Molecular Graphs","summary":" Graph Neural Networks (GNNs) have been widely adopted for drug discovery with\nmolecular graphs. Nevertheless, current GNNs are mainly good at leveraging\nshort-range interactions (SRI) but struggle to capture long-range interactions\n(LRI), both of which are crucial for determining molecular properties. To\ntackle this issue, we propose a method that implicitly projects all original\natoms into a few Neural Atoms, which abstracts the collective information of\natomic groups within a molecule. Specifically, we explicitly exchange the\ninformation among neural atoms and project them back to the atoms'\nrepresentations as an enhancement. With this mechanism, neural atoms establish\nthe communication channels among distant nodes, effectively reducing the\ninteraction scope of arbitrary node pairs into a single hop. To provide an\ninspection of our method from a physical perspective, we reveal its connection\nwith the traditional LRI calculation method, Ewald Summation. We conduct\nextensive experiments on three long-range graph benchmarks, covering both\ngraph-level and link-level tasks on molecular graphs. We empirically justify\nthat our method can be equipped with an arbitrary GNN and help to capture LRI.\n","authors":["Xuan Li","Zhanke Zhou","Jiangchao Yao","Yu Rong","Lu Zhang","Bo Han"],"pdf_url":"https://arxiv.org/pdf/2311.01276v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10594v2","updated":"2023-11-27T13:02:06Z","published":"2023-03-19T07:53:31Z","title":"AdaptGuard: Defending Against Universal Attacks for Model Adaptation","summary":" Model adaptation aims at solving the domain transfer problem under the\nconstraint of only accessing the pretrained source models. With the increasing\nconsiderations of data privacy and transmission efficiency, this paradigm has\nbeen gaining recent popularity. This paper studies the vulnerability to\nuniversal attacks transferred from the source domain during model adaptation\nalgorithms due to the existence of malicious providers. We explore both\nuniversal adversarial perturbations and backdoor attacks as loopholes on the\nsource side and discover that they still survive in the target models after\nadaptation. 
To address this issue, we propose a model preprocessing framework,\nnamed AdaptGuard, to improve the security of model adaptation algorithms.\nAdaptGuard avoids direct use of the risky source parameters through knowledge\ndistillation and utilizes the pseudo adversarial samples under adjusted radius\nto enhance the robustness. AdaptGuard is a plug-and-play module that requires\nneither robust pretrained models nor any changes for the following model\nadaptation algorithms. Extensive results on three commonly used datasets and\ntwo popular adaptation methods validate that AdaptGuard can effectively defend\nagainst universal attacks and maintain clean accuracy in the target domain\nsimultaneously. We hope this research will shed light on the safety and\nrobustness of transfer learning. Code is available at\nhttps://github.com/TomSheng21/AdaptGuard.\n","authors":["Lijun Sheng","Jian Liang","Ran He","Zilei Wang","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2303.10594v2.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2311.15782v1","updated":"2023-11-27T12:55:39Z","published":"2023-11-27T12:55:39Z","title":"Relationship between Model Compression and Adversarial Robustness: A\n Review of Current Evidence","summary":" Increasing the model capacity is a known approach to enhance the adversarial\nrobustness of deep learning networks. On the other hand, various model\ncompression techniques, including pruning and quantization, can reduce the size\nof the network while preserving its accuracy. Several recent studies have\naddressed the relationship between model compression and adversarial\nrobustness, while some experiments have reported contradictory results. This\nwork summarizes available evidence and discusses possible explanations for the\nobserved effects.\n","authors":["Svetlana Pavlitska","Hannes Grolig","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2311.15782v1.pdf","comment":"Accepted for publication at SSCI 2023"},{"id":"http://arxiv.org/abs/2311.15781v1","updated":"2023-11-27T12:54:47Z","published":"2023-11-27T12:54:47Z","title":"Increasing Coverage and Precision of Textual Information in Multilingual\n Knowledge Graphs","summary":" Recent work in Natural Language Processing and Computer Vision has been using\ntextual information -- e.g., entity names and descriptions -- available in\nknowledge graphs to ground neural models to high-quality structured data.\nHowever, when it comes to non-English languages, the quantity and quality of\ntextual information are comparatively scarce. To address this issue, we\nintroduce the novel task of automatic Knowledge Graph Enhancement (KGE) and\nperform a thorough investigation on bridging the gap in both the quantity and\nquality of textual information between English and non-English languages. More\nspecifically, we: i) bring to light the problem of increasing multilingual\ncoverage and precision of entity names and descriptions in Wikidata; ii)\ndemonstrate that state-of-the-art methods, namely, Machine Translation (MT),\nWeb Search (WS), and Large Language Models (LLMs), struggle with this task;\niii) present M-NTA, a novel unsupervised approach that combines MT, WS, and\nLLMs to generate high-quality textual information; and, iv) study the impact of\nincreasing multilingual coverage and precision of non-English textual\ninformation in Entity Linking, Knowledge Graph Completion, and Question\nAnswering. 
As part of our effort towards better multilingual knowledge graphs,\nwe also introduce WikiKGE-10, the first human-curated benchmark to evaluate KGE\napproaches in 10 languages across 7 language families.\n","authors":["Simone Conia","Min Li","Daniel Lee","Umar Farooq Minhas","Ihab Ilyas","Yunyao Li"],"pdf_url":"https://arxiv.org/pdf/2311.15781v1.pdf","comment":"Camera ready for EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.15772v1","updated":"2023-11-27T12:44:42Z","published":"2023-11-27T12:44:42Z","title":"Attend Who is Weak: Enhancing Graph Condensation via Cross-Free\n Adversarial Training","summary":" In this paper, we study the \\textit{graph condensation} problem by\ncompressing the large, complex graph into a concise, synthetic representation\nthat preserves the most essential and discriminative information of structure\nand features. We seminally propose the concept of Shock Absorber (a type of\nperturbation) that enhances the robustness and stability of the original graphs\nagainst changes in an adversarial training fashion. Concretely, (I) we forcibly\nmatch the gradients between pre-selected graph neural networks (GNNs) trained\non a synthetic, simplified graph and the original training graph at regularly\nspaced intervals. (II) Before each update synthetic graph point, a Shock\nAbsorber serves as a gradient attacker to maximize the distance between the\nsynthetic dataset and the original graph by selectively perturbing the parts\nthat are underrepresented or insufficiently informative. We iteratively repeat\nthe above two processes (I and II) in an adversarial training fashion to\nmaintain the highly-informative context without losing correlation with the\noriginal dataset. More importantly, our shock absorber and the synthesized\ngraph parallelly share the backward process in a free training manner. Compared\nto the original adversarial training, it introduces almost no additional time\noverhead.\n We validate our framework across 8 datasets (3 graph and 5 node\nclassification datasets) and achieve prominent results: for example, on Cora,\nCiteseer and Ogbn-Arxiv, we can gain nearly 1.13% to 5.03% improvements compare\nwith SOTA models. Moreover, our algorithm adds only about 0.2% to 2.2%\nadditional time overhead over Flicker, Citeseer and Ogbn-Arxiv. Compared to the\ngeneral adversarial training, our approach improves time efficiency by nearly\n4-fold.\n","authors":["Xinglin Li","Kun Wang","Hanhui Deng","Yuxuan Liang","Di Wu"],"pdf_url":"https://arxiv.org/pdf/2311.15772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16883v2","updated":"2023-11-27T12:33:27Z","published":"2023-09-28T22:41:47Z","title":"The Lipschitz-Variance-Margin Tradeoff for Enhanced Randomized Smoothing","summary":" Real-life applications of deep neural networks are hindered by their unsteady\npredictions when faced with noisy inputs and adversarial attacks. The certified\nradius is in this context a crucial indicator of the robustness of models.\nHowever how to design an efficient classifier with a sufficient certified\nradius? Randomized smoothing provides a promising framework by relying on noise\ninjection in inputs to obtain a smoothed and more robust classifier. In this\npaper, we first show that the variance introduced by randomized smoothing\nclosely interacts with two other important properties of the classifier,\n\\textit{i.e.} its Lipschitz constant and margin. 
More precisely, our work\nemphasizes the dual impact of the Lipschitz constant of the base classifier, on\nboth the smoothed classifier and the empirical variance. Moreover, to increase\nthe certified robust radius, we introduce a different simplex projection\ntechnique for the base classifier to leverage the variance-margin trade-off\nthanks to Bernstein's concentration inequality, along with an enhanced\nLipschitz bound. Experimental results show a significant improvement in\ncertified accuracy compared to current state-of-the-art methods. Our novel\ncertification procedure allows us to use pre-trained models that are used with\nrandomized smoothing, effectively improving the current certification radius in\na zero-shot manner.\n","authors":["Blaise Delattre","Alexandre Araujo","Quentin Barthélemy","Alexandre Allauzen"],"pdf_url":"https://arxiv.org/pdf/2309.16883v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15756v1","updated":"2023-11-27T12:22:44Z","published":"2023-11-27T12:22:44Z","title":"Learning Multi-Frequency Partial Correlation Graphs","summary":" Despite the large research effort devoted to learning dependencies between\ntime series, the state of the art still faces a major limitation: existing\nmethods learn partial correlations but fail to discriminate across distinct\nfrequency bands. Motivated by many applications in which this differentiation\nis pivotal, we overcome this limitation by learning a block-sparse,\nfrequency-dependent, partial correlation graph, in which layers correspond to\ndifferent frequency bands, and partial correlations can occur over just a few\nlayers. To this aim, we formulate and solve two nonconvex learning problems:\nthe first has a closed-form solution and is suitable when there is prior\nknowledge about the number of partial correlations; the second hinges on an\niterative solution based on successive convex approximation, and is effective\nfor the general case where no prior knowledge is available. Numerical results\non synthetic data show that the proposed methods outperform the current state\nof the art. Finally, the analysis of financial time series confirms that\npartial correlations exist only within a few frequency bands, underscoring how\nour methods enable the gaining of valuable insights that would be undetected\nwithout discriminating along the frequency domain.\n","authors":["Gabriele D'Acunto","Paolo Di Lorenzo","Francesco Bonchi","Stefania Sardellitti","Sergio Barbarossa"],"pdf_url":"https://arxiv.org/pdf/2311.15756v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2102.12920v3","updated":"2023-11-27T12:18:51Z","published":"2021-02-25T15:18:13Z","title":"Emerging Trends in Federated Learning: From Model Fusion to Federated X\n Learning","summary":" Federated learning is a new learning paradigm that decouples data collection\nand model training via multi-party computation and model aggregation. As a\nflexible learning setting, federated learning has the potential to integrate\nwith other learning frameworks. We conduct a focused survey of federated\nlearning in conjunction with other learning algorithms. Specifically, we\nexplore various learning algorithms to improve the vanilla federated averaging\nalgorithm and review model fusion methods such as adaptive aggregation,\nregularization, clustered methods, and Bayesian methods. 
Following the emerging\ntrends, we also discuss federated learning in the intersection with other\nlearning paradigms, termed federated X learning, where X includes multitask\nlearning, meta-learning, transfer learning, unsupervised learning, and\nreinforcement learning. This survey reviews the state of the art, challenges,\nand future directions.\n","authors":["Shaoxiong Ji","Yue Tan","Teemu Saravirta","Zhiqin Yang","Lauri Vasankari","Shirui Pan","Guodong Long","Anwar Walid"],"pdf_url":"https://arxiv.org/pdf/2102.12920v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10092v3","updated":"2023-11-27T11:57:36Z","published":"2023-10-16T05:54:30Z","title":"Label Differential Privacy via Aggregation","summary":" In many real-world applications, due to recent developments in the privacy\nlandscape, training data may be aggregated to preserve the privacy of sensitive\ntraining labels. In the learning from label proportions (LLP) framework, the\ndataset is partitioned into bags of feature-vectors which are available only\nwith the sum of the labels per bag. A further restriction, which we call\nlearning from bag aggregates (LBA) is where instead of individual\nfeature-vectors, only the (possibly weighted) sum of the feature-vectors per\nbag is available. We study whether such aggregation techniques can provide\nprivacy guarantees under the notion of label differential privacy (label-DP)\npreviously studied in for e.g. [Chaudhuri-Hsu'11, Ghazi et al.'21, Esfandiari\net al.'22].\n It is easily seen that naive LBA and LLP do not provide label-DP. Our main\nresult however, shows that weighted LBA using iid Gaussian weights with $m$\nrandomly sampled disjoint $k$-sized bags is in fact $(\\varepsilon,\n\\delta)$-label-DP for any $\\varepsilon > 0$ with $\\delta \\approx\n\\exp(-\\Omega(\\sqrt{k}))$ assuming a lower bound on the linear-mse regression\nloss. Further, the $\\ell_2^2$-regressor which minimizes the loss on the\naggregated dataset has a loss within $\\left(1 + o(1)\\right)$-factor of the\noptimum on the original dataset w.p. $\\approx 1 - exp(-\\Omega(m))$. We\nemphasize that no additive label noise is required.\n The analogous weighted-LLP does not however admit label-DP. Nevertheless, we\nshow that if additive $N(0, 1)$ noise can be added to any constant fraction of\nthe instance labels, then the noisy weighted-LLP admits similar label-DP\nguarantees without assumptions on the dataset, while preserving the utility of\nLipschitz-bounded neural mse-regression tasks.\n Our work is the first to demonstrate that label-DP can be achieved by\nrandomly weighted aggregation for regression tasks, using no or little additive\nnoise.\n","authors":["Anand Brahmbhatt","Rishi Saket","Shreyas Havaldar","Anshul Nasery","Aravindan Raghuveer"],"pdf_url":"https://arxiv.org/pdf/2310.10092v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07983v2","updated":"2023-11-27T11:54:56Z","published":"2023-09-14T18:40:28Z","title":"SLMIA-SR: Speaker-Level Membership Inference Attacks against Speaker\n Recognition Systems","summary":" Membership inference attacks allow adversaries to determine whether a\nparticular example was contained in the model's training dataset. While\nprevious works have confirmed the feasibility of such attacks in various\napplications, none has focused on speaker recognition (SR), a promising\nvoice-based biometric recognition technique. In this work, we propose SLMIA-SR,\nthe first membership inference attack tailored to SR. 
In contrast to\nconventional example-level attack, our attack features speaker-level membership\ninference, i.e., determining if any voices of a given speaker, either the same\nas or different from the given inference voices, have been involved in the\ntraining of a model. It is particularly useful and practical since the training\nand inference voices are usually distinct, and it is also meaningful\nconsidering the open-set nature of SR, namely, the recognition speakers were\noften not present in the training data. We utilize intra-similarity and\ninter-dissimilarity, two training objectives of SR, to characterize the\ndifferences between training and non-training speakers and quantify them with\ntwo groups of features driven by carefully-established feature engineering to\nmount the attack. To improve the generalizability of our attack, we propose a\nnovel mixing ratio training strategy to train attack models. To enhance the\nattack performance, we introduce voice chunk splitting to cope with the limited\nnumber of inference voices and propose to train attack models dependent on the\nnumber of inference voices. Our attack is versatile and can work in both\nwhite-box and black-box scenarios. Additionally, we propose two novel\ntechniques to reduce the number of black-box queries while maintaining the\nattack performance. Extensive experiments demonstrate the effectiveness of\nSLMIA-SR.\n","authors":["Guangke Chen","Yedi Zhang","Fu Song"],"pdf_url":"https://arxiv.org/pdf/2309.07983v2.pdf","comment":"In Proceedings of the 31st Network and Distributed System Security\n (NDSS) Symposium, 2024"},{"id":"http://arxiv.org/abs/2310.01144v2","updated":"2023-11-27T11:54:55Z","published":"2023-10-02T12:32:18Z","title":"The Map Equation Goes Neural","summary":" Community detection and graph clustering are essential for unsupervised data\nexploration and understanding the high-level organisation of networked systems.\nRecently, graph clustering has received attention as a primary task for graph\nneural networks. Although hierarchical graph pooling has been shown to improve\nperformance in graph and node classification tasks, it performs poorly in\nidentifying meaningful clusters. Community detection has a long history in\nnetwork science, but typically relies on optimising objective functions with\ncustom-tailored search algorithms, not leveraging recent advances in deep\nlearning, particularly from graph neural networks. In this paper, we narrow\nthis gap between the deep learning and network science communities. We consider\nthe map equation, an information-theoretic objective function for unsupervised\ncommunity detection. Expressing it in a fully differentiable tensor form that\nproduces soft cluster assignments, we optimise the map equation with deep\nlearning through gradient descent. More specifically, the reformulated map\nequation is a loss function compatible with any graph neural network\narchitecture, enabling flexible clustering and graph pooling that clusters both\ngraph structure and data features in an end-to-end way, automatically finding\nan optimum number of clusters without explicit regularisation by following the\nminimum description length principle. We evaluate our approach experimentally\nusing different neural network architectures for unsupervised clustering in\nsynthetic and real data. 
Our results show that our approach achieves\ncompetitive performance against baselines, naturally detects overlapping\ncommunities, and avoids over-partitioning sparse graphs.\n","authors":["Christopher Blöcker","Chester Tan","Ingo Scholtes"],"pdf_url":"https://arxiv.org/pdf/2310.01144v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17005v2","updated":"2023-11-27T11:43:53Z","published":"2023-05-26T15:04:06Z","title":"Aggregating Capacity in FL through Successive Layer Training for\n Computationally-Constrained Devices","summary":" Federated learning (FL) is usually performed on resource-constrained edge\ndevices, e.g., with limited memory for the computation. If the required memory\nto train a model exceeds this limit, the device will be excluded from the\ntraining. This can lead to a lower accuracy as valuable data and computation\nresources are excluded from training, also causing bias and unfairness. The FL\ntraining process should be adjusted to such constraints. The state-of-the-art\ntechniques propose training subsets of the FL model at constrained devices,\nreducing their resource requirements for training. But these techniques largely\nlimit the co-adaptation among parameters of the model and are highly\ninefficient, as we show: it is actually better to train a smaller (less\naccurate) model by the system where all the devices can train the model\nend-to-end, than applying such techniques. We propose a new method that enables\nsuccessive freezing and training of the parameters of the FL model at devices,\nreducing the training's resource requirements at the devices, while still\nallowing enough co-adaptation between parameters. We show through extensive\nexperimental evaluation that our technique greatly improves the accuracy of the\ntrained model (by 52.4 p.p.) compared with the state of the art, efficiently\naggregating the computation capacity available on distributed devices.\n","authors":["Kilian Pfeiffer","Ramin Khalili","Jörg Henkel"],"pdf_url":"https://arxiv.org/pdf/2305.17005v2.pdf","comment":"accepted at NeurIPS'23"},{"id":"http://arxiv.org/abs/2311.12530v2","updated":"2023-11-27T11:28:21Z","published":"2023-11-21T11:21:53Z","title":"An efficient likelihood-free Bayesian inference method based on\n sequential neural posterior estimation","summary":" Sequential neural posterior estimation (SNPE) techniques have been recently\nproposed for dealing with simulation-based models with intractable likelihoods.\nUnlike approximate Bayesian computation, SNPE techniques learn the posterior\nfrom sequential simulation using neural network-based conditional density\nestimators by minimizing a specific loss function. The SNPE method proposed by\nLueckmann et al. (2017) used a calibration kernel to boost the sample weights\naround the observed data, resulting in a concentrated loss function. However,\nthe use of calibration kernels may increase the variances of both the empirical\nloss and its gradient, making the training inefficient. To improve the\nstability of SNPE, this paper proposes to use an adaptive calibration kernel\nand several variance reduction techniques. 
The proposed method greatly speeds\nup the process of training, and provides a better approximation of the\nposterior than the original SNPE method and some existing competitors as\nconfirmed by numerical experiments.\n","authors":["Yifei Xiong","Xiliang Yang","Sanguo Zhang","Zhijian He"],"pdf_url":"https://arxiv.org/pdf/2311.12530v2.pdf","comment":"30 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.15728v1","updated":"2023-11-27T11:26:41Z","published":"2023-11-27T11:26:41Z","title":"Adinkra Symbol Recognition using Classical Machine Learning and Deep\n Learning","summary":" Artificial intelligence (AI) has emerged as a transformative influence,\nengendering paradigm shifts in global societies, spanning academia and\nindustry. However, in light of these rapid advances, addressing the\nunderrepresentation of black communities and African countries in AI is\ncrucial. Boosting enthusiasm for AI can be effectively accomplished by\nshowcasing straightforward applications around tasks like identifying and\ncategorizing traditional symbols, such as Adinkra symbols, or familiar objects\nwithin the community. In this research endeavor, we dived into classical\nmachine learning and harnessed the power of deep learning models to tackle the\nintricate task of classifying and recognizing Adinkra symbols. The idea led to\na newly constructed ADINKRA dataset comprising 174,338 images meticulously\norganized into 62 distinct classes, each representing a singular and emblematic\nsymbol. We constructed a CNN model for classification and recognition using six\nconvolutional layers, three fully connected (FC) layers, and optional dropout\nregularization. The model is a simpler and smaller version of VGG, with fewer\nlayers, smaller channel sizes, and a fixed kernel size. Additionally, we tap\ninto the transfer learning capabilities provided by pre-trained models like VGG\nand ResNet. These models assist us in both classifying images and extracting\nfeatures that can be used with classical machine learning models. We assess the\nmodel's performance by measuring its accuracy and convergence rate and\nvisualizing the areas that significantly influence its predictions. These\nevaluations serve as a foundational benchmark for future assessments of the\nADINKRA dataset. We hope this application exemplar inspires ideas on the\nvarious uses of AI in organizing our traditional and modern lives.\n","authors":["Michael Adjeisah","Kwame Omono Asamoah","Martha Asamoah Yeboah","Raji Rafiu King","Godwin Ferguson Achaab","Kingsley Adjei"],"pdf_url":"https://arxiv.org/pdf/2311.15728v1.pdf","comment":"15 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2311.15722v1","updated":"2023-11-27T11:17:20Z","published":"2023-11-27T11:17:20Z","title":"GLIME: General, Stable and Local LIME Explanation","summary":" As black-box machine learning models grow in complexity and find applications\nin high-stakes scenarios, it is imperative to provide explanations for their\npredictions. Although Local Interpretable Model-agnostic Explanations (LIME)\n[22] is a widely adpoted method for understanding model behaviors, it is\nunstable with respect to random seeds [35,24,3] and exhibits low local fidelity\n(i.e., how well the explanation approximates the model's local behaviors)\n[21,16]. 
Our study shows that this instability problem stems from small sample\nweights, leading to the dominance of regularization and slow convergence.\nAdditionally, LIME's sampling neighborhood is non-local and biased towards the\nreference, resulting in poor local fidelity and sensitivity to reference\nchoice. To tackle these challenges, we introduce GLIME, an enhanced framework\nextending LIME and unifying several prior methods. Within the GLIME framework,\nwe derive an equivalent formulation of LIME that achieves significantly faster\nconvergence and improved stability. By employing a local and unbiased sampling\ndistribution, GLIME generates explanations with higher local fidelity compared\nto LIME. GLIME explanations are independent of reference choice. Moreover,\nGLIME offers users the flexibility to choose a sampling distribution based on\ntheir specific scenarios.\n","authors":["Zeren Tan","Yang Tian","Jian Li"],"pdf_url":"https://arxiv.org/pdf/2311.15722v1.pdf","comment":"Accepted by NeurIPS 2023 as a Spotlight paper"},{"id":"http://arxiv.org/abs/2311.15719v1","updated":"2023-11-27T11:12:33Z","published":"2023-11-27T11:12:33Z","title":"Variational Autoencoders for Feature Exploration and Malignancy\n Prediction of Lung Lesions","summary":" Lung cancer is responsible for 21% of cancer deaths in the UK and five-year\nsurvival rates are heavily influenced by the stage the cancer was identified\nat. Recent studies have demonstrated the capability of AI methods for accurate\nand early diagnosis of lung cancer from routine scans. However, this evidence\nhas not translated into clinical practice with one barrier being a lack of\ninterpretable models. This study investigates the application Variational\nAutoencoders (VAEs), a type of generative AI model, to lung cancer lesions.\nProposed models were trained on lesions extracted from 3D CT scans in the\nLIDC-IDRI public dataset. Latent vector representations of 2D slices produced\nby the VAEs were explored through clustering to justify their quality and used\nin an MLP classifier model for lung cancer diagnosis, the best model achieved\nstate-of-the-art metrics of AUC 0.98 and 93.1% accuracy. Cluster analysis shows\nthe VAE latent space separates the dataset of malignant and benign lesions\nbased on meaningful feature components including tumour size, shape, patient\nand malignancy class. We also include a comparative analysis of the standard\nGaussian VAE (GVAE) and the more recent Dirichlet VAE (DirVAE), which replaces\nthe prior with a Dirichlet distribution to encourage a more explainable latent\nspace with disentangled feature representation. Finally, we demonstrate the\npotential for latent space traversals corresponding to clinically meaningful\nfeature changes.\n","authors":["Benjamin Keel","Aaron Quyn","David Jayne","Samuel D. Relton"],"pdf_url":"https://arxiv.org/pdf/2311.15719v1.pdf","comment":"10 pages (main paper), 5 pages (references), 5 figures, 2 tables,\n work accepted for BMVC 2023"},{"id":"http://arxiv.org/abs/2311.15703v1","updated":"2023-11-27T10:41:28Z","published":"2023-11-27T10:41:28Z","title":"Tabular Two-Dimensional Correlation Analysis for Multifaceted\n Characterization Data","summary":" We propose tabular two-dimensional correlation analysis for extracting\nfeatures from multifaceted characterization data, essential for understanding\nmaterial properties. 
This method visualizes similarities and phase lags in\nstructural parameter changes through heatmaps, combining hierarchical\nclustering and asynchronous correlations. We applied the proposed method to\ndatasets of carbon nanotube (CNTs) films annealed at various temperatures and\nrevealed the complexity of their hierarchical structures, which include\nelements like voids, bundles, and amorphous carbon. Our analysis addresses the\nchallenge of attempting to understand the sequence of structural changes,\nespecially in multifaceted characterization data where 11 structural parameters\nderived from 8 characterization methods interact with complex behavior. The\nresults show how phase lags (asynchronous changes from stimuli) and parameter\nsimilarities can illuminate the sequence of structural changes in materials,\nproviding insights into phenomena like the removal of amorphous carbon and\ngraphitization in annealed CNTs. This approach is beneficial even with limited\ndata and holds promise for a wide range of material analyses, demonstrating its\npotential in elucidating complex material behaviors and properties.\n","authors":["Shun Muroga","Satoshi Yamazaki","Koji Michishio","Hideaki Nakajima","Takahiro Morimoto","Nagayasu Oshima","Kazufumi Kobashi","Toshiya Okazaki"],"pdf_url":"https://arxiv.org/pdf/2311.15703v1.pdf","comment":"15 pages, 4 figures"},{"id":"http://arxiv.org/abs/2310.01825v2","updated":"2023-11-27T10:39:13Z","published":"2023-10-03T06:42:28Z","title":"Empirical Study of PEFT techniques for Winter Wheat Segmentation","summary":" Parameter Efficient Fine Tuning (PEFT) techniques have recently experienced\nsignificant growth and have been extensively employed to adapt large vision and\nlanguage models to various domains, enabling satisfactory model performance\nwith minimal computational needs. Despite these advances, more research has yet\nto delve into potential PEFT applications in real-life scenarios, particularly\nin the critical domains of remote sensing and crop monitoring. The diversity of\nclimates across different regions and the need for comprehensive large-scale\ndatasets have posed significant obstacles to accurately identify crop types\nacross varying geographic locations and changing growing seasons. This study\nseeks to bridge this gap by comprehensively exploring the feasibility of\ncross-area and cross-year out-of-distribution generalization using the\nState-of-the-Art (SOTA) wheat crop monitoring model. The aim of this work is to\nexplore PEFT approaches for crop monitoring. Specifically, we focus on adapting\nthe SOTA TSViT model to address winter wheat field segmentation, a critical\ntask for crop monitoring and food security. This adaptation process involves\nintegrating different PEFT techniques, including BigFit, LoRA, Adaptformer, and\nprompt tuning. Using PEFT techniques, we achieved notable results comparable to\nthose achieved using full fine-tuning methods while training only a mere 0.7%\nparameters of the whole TSViT architecture. The in-house labeled data-set,\nreferred to as the Beqaa-Lebanon dataset, comprises high-quality annotated\npolygons for wheat and non-wheat classes with a total surface of 170 kmsq, over\nfive consecutive years. Using Sentinel-2 images, our model achieved a 84%\nF1-score. We intend to publicly release the Lebanese winter wheat data set,\ncode repository, and model weights.\n","authors":["Mohamad Hasan Zahweh","Hasan Nasrallah","Mustafa Shukor","Ghaleb Faour","Ali J. 
Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15696v1","updated":"2023-11-27T10:32:31Z","published":"2023-11-27T10:32:31Z","title":"Peptide Binding Classification on Quantum Computers","summary":" We conduct an extensive study on using near-term quantum computers for a task\nin the domain of computational biology. By constructing quantum models based on\nparameterised quantum circuits we perform sequence classification on a task\nrelevant to the design of therapeutic proteins, and find competitive\nperformance with classical baselines of similar scale. To study the effect of\nnoise, we run some of the best-performing quantum models with favourable\nresource requirements on emulators of state-of-the-art noisy quantum\nprocessors. We then apply error mitigation methods to improve the signal. We\nfurther execute these quantum models on the Quantinuum H1-1 trapped-ion quantum\nprocessor and observe very close agreement with noiseless exact simulation.\nFinally, we perform feature attribution methods and find that the quantum\nmodels indeed identify sensible relationships, at least as well as the\nclassical baselines. This work constitutes the first proof-of-concept\napplication of near-term quantum computing to a task critical to the design of\ntherapeutic proteins, opening the route toward larger-scale applications in\nthis and related fields, in line with the hardware development roadmaps of\nnear-term quantum technologies.\n","authors":["Charles London","Douglas Brown","Wenduan Xu","Sezen Vatansever","Christopher James Langmead","Dimitri Kartsaklis","Stephen Clark","Konstantinos Meichanetzidis"],"pdf_url":"https://arxiv.org/pdf/2311.15696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15691v1","updated":"2023-11-27T10:28:44Z","published":"2023-11-27T10:28:44Z","title":"Automated discovery of trade-off between utility, privacy and fairness\n in machine learning models","summary":" Machine learning models are deployed as a central component in decision\nmaking and policy operations with direct impact on individuals' lives. In order\nto act ethically and comply with government regulations, these models need to\nmake fair decisions and protect the users' privacy. However, such requirements\ncan come with decrease in models' performance compared to their potentially\nbiased, privacy-leaking counterparts. Thus the trade-off between fairness,\nprivacy and performance of ML models emerges, and practitioners need a way of\nquantifying this trade-off to enable deployment decisions. In this work we\ninterpret this trade-off as a multi-objective optimization problem, and propose\nPFairDP, a pipeline that uses Bayesian optimization for discovery of\nPareto-optimal points between fairness, privacy and utility of ML models. We\nshow how PFairDP can be used to replicate known results that were achieved\nthrough manual constraint setting process. We further demonstrate effectiveness\nof PFairDP with experiments on multiple models and datasets.\n","authors":["Bogdan Ficiu","Neil D. 
Lawrence","Andrei Paleyes"],"pdf_url":"https://arxiv.org/pdf/2311.15691v1.pdf","comment":"3rd Workshop on Bias and Fairness in AI (BIAS), ECML 2023"},{"id":"http://arxiv.org/abs/2306.07294v2","updated":"2023-11-27T10:21:24Z","published":"2023-06-10T11:25:31Z","title":"Computational and Storage Efficient Quadratic Neurons for Deep Neural\n Networks","summary":" Deep neural networks (DNNs) have been widely deployed across diverse domains\nsuch as computer vision and natural language processing. However, the\nimpressive accomplishments of DNNs have been realized alongside extensive\ncomputational demands, thereby impeding their applicability on\nresource-constrained devices. To address this challenge, many researchers have\nbeen focusing on basic neuron structures, the fundamental building blocks of\nneural networks, to alleviate the computational and storage cost. In this work,\nan efficient quadratic neuron architecture distinguished by its enhanced\nutilization of second-order computational information is introduced. By virtue\nof their better expressivity, DNNs employing the proposed quadratic neurons can\nattain similar accuracy with fewer neurons and computational cost. Experimental\nresults have demonstrated that the proposed quadratic neuron structure exhibits\nsuperior computational and storage efficiency across various tasks when\ncompared with both linear and non-linear neurons in prior work.\n","authors":["Chuangtao Chen","Grace Li Zhang","Xunzhao Yin","Cheng Zhuo","Ulf Schlichtmann","Bing Li"],"pdf_url":"https://arxiv.org/pdf/2306.07294v2.pdf","comment":"Accepted by Design Automation and Test in Europe (DATE) 2024"},{"id":"http://arxiv.org/abs/2311.15685v1","updated":"2023-11-27T10:18:17Z","published":"2023-11-27T10:18:17Z","title":"The Battleship Approach to the Low Resource Entity Matching Problem","summary":" Entity matching, a core data integration problem, is the task of deciding\nwhether two data tuples refer to the same real-world entity. Recent advances in\ndeep learning methods, using pre-trained language models, were proposed for\nresolving entity matching. Although demonstrating unprecedented results, these\nsolutions suffer from a major drawback as they require large amounts of labeled\ndata for training, and, as such, are inadequate to be applied to low resource\nentity matching problems. To overcome the challenge of obtaining sufficient\nlabeled data we offer a new active learning approach, focusing on a selection\nmechanism that exploits unique properties of entity matching. We argue that a\ndistributed representation of a tuple pair indicates its informativeness when\nconsidered among other pairs. This is used consequently in our approach that\niteratively utilizes space-aware considerations. Bringing it all together, we\ntreat the low resource entity matching problem as a Battleship game, hunting\nindicative samples, focusing on positive ones, through awareness of the latent\nspace along with careful planning of next sampling iterations. 
An extensive\nexperimental analysis shows that the proposed algorithm outperforms\nstate-of-the-art active learning solutions to low resource entity matching, and\nalthough using less samples, can be as successful as state-of-the-art fully\ntrained known algorithms.\n","authors":["Bar Genossar","Avigdor Gal","Roee Shraga"],"pdf_url":"https://arxiv.org/pdf/2311.15685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15682v1","updated":"2023-11-27T10:16:22Z","published":"2023-11-27T10:16:22Z","title":"Information theoretic study of the neural geometry induced by category\n learning","summary":" Categorization is an important topic both for biological and artificial\nneural networks. Here, we take an information theoretic approach to assess the\nefficiency of the representations induced by category learning. We show that\none can decompose the relevant Bayesian cost into two components, one for the\ncoding part and one for the decoding part. Minimizing the coding cost implies\nmaximizing the mutual information between the set of categories and the neural\nactivities. We analytically show that this mutual information can be written as\nthe sum of two terms that can be interpreted as (i) finding an appropriate\nrepresentation space, and, (ii) building a representation with the appropriate\nmetrics, based on the neural Fisher information on this space. One main\nconsequence is that category learning induces an expansion of neural space near\ndecision boundaries. Finally, we provide numerical illustrations that show how\nFisher information of the coding neural population aligns with the boundaries\nbetween categories.\n","authors":["Laurent Bonnasse-Gahot","Jean-Pierre Nadal"],"pdf_url":"https://arxiv.org/pdf/2311.15682v1.pdf","comment":"7 pages, 2 figures, Accepted (Oral) to InfoCog@NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.15673v1","updated":"2023-11-27T10:02:12Z","published":"2023-11-27T10:02:12Z","title":"Accelerating Hierarchical Associative Memory: A Deep Equilibrium\n Approach","summary":" Hierarchical Associative Memory models have recently been proposed as a\nversatile extension of continuous Hopfield networks. In order to facilitate\nfuture research on such models, especially at scale, we focus on increasing\ntheir simulation efficiency on digital hardware. In particular, we propose two\nstrategies to speed up memory retrieval in these models, which corresponds to\ntheir use at inference, but is equally important during training. First, we\nshow how they can be cast as Deep Equilibrium Models, which allows using faster\nand more stable solvers. Second, inspired by earlier work, we show that\nalternating optimization of the even and odd layers accelerates memory\nretrieval by a factor close to two. Combined, these two techniques allow for a\nmuch faster energy minimization, as shown in our proof-of-concept experimental\nresults. The code is available at https://github.com/cgoemaere/hamdeq\n","authors":["Cédric Goemaere","Johannes Deleu","Thomas Demeester"],"pdf_url":"https://arxiv.org/pdf/2311.15673v1.pdf","comment":"Accepted at the \"Associative Memory & Hopfield Networks'' workshop at\n NeurIPS, 2023"},{"id":"http://arxiv.org/abs/2311.13959v2","updated":"2023-11-27T09:47:10Z","published":"2023-11-23T12:17:45Z","title":"RankFeat&RankWeight: Rank-1 Feature/Weight Removal for\n Out-of-distribution Detection","summary":" The task of out-of-distribution (OOD) detection is crucial for deploying\nmachine learning models in real-world settings. 
In this paper, we observe that\nthe singular value distributions of the in-distribution (ID) and OOD features\nare quite different: the OOD feature matrix tends to have a larger dominant\nsingular value than the ID feature, and the class predictions of OOD samples\nare largely determined by it. This observation motivates us to propose\n\\texttt{RankFeat}, a simple yet effective \\emph{post hoc} approach for OOD\ndetection by removing the rank-1 matrix composed of the largest singular value\nand the associated singular vectors from the high-level feature.\n\\texttt{RankFeat} achieves \\emph{state-of-the-art} performance and reduces the\naverage false positive rate (FPR95) by 17.90\\% compared with the previous best\nmethod. The success of \\texttt{RankFeat} motivates us to investigate whether a\nsimilar phenomenon would exist in the parameter matrices of neural networks. We\nthus propose \\texttt{RankWeight} which removes the rank-1 weight from the\nparameter matrices of a single deep layer. Our \\texttt{RankWeight}is also\n\\emph{post hoc} and only requires computing the rank-1 matrix once. As a\nstandalone approach, \\texttt{RankWeight} has very competitive performance\nagainst other methods across various backbones. Moreover, \\texttt{RankWeight}\nenjoys flexible compatibility with a wide range of OOD detection methods. The\ncombination of \\texttt{RankWeight} and \\texttt{RankFeat} refreshes the new\n\\emph{state-of-the-art} performance, achieving the FPR95 as low as 16.13\\% on\nthe ImageNet-1k benchmark. Extensive ablation studies and comprehensive\ntheoretical analyses are presented to support the empirical results.\n","authors":["Yue Song","Nicu Sebe","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2311.13959v2.pdf","comment":"submitted to T-PAMI. arXiv admin note: substantial text overlap with\n arXiv:2209.08590"},{"id":"http://arxiv.org/abs/2311.13265v2","updated":"2023-11-27T09:40:19Z","published":"2023-11-22T09:31:19Z","title":"Improved identification accuracy in equation learning via comprehensive\n $\\boldsymbol{R^2}$-elimination and Bayesian model selection","summary":" In the field of equation learning, exhaustively considering all possible\nequations derived from a basis function dictionary is infeasible. Sparse\nregression and greedy algorithms have emerged as popular approaches to tackle\nthis challenge. However, the presence of multicollinearity poses difficulties\nfor sparse regression techniques, and greedy steps may inadvertently exclude\nterms of the true equation, leading to reduced identification accuracy. In this\narticle, we present an approach that strikes a balance between\ncomprehensiveness and efficiency in equation learning. Inspired by stepwise\nregression, our approach combines the coefficient of determination, $R^2$, and\nthe Bayesian model evidence, $p(\\boldsymbol y|\\mathcal M)$, in a novel way. Our\nprocedure is characterized by a comprehensive search with just a minor\nreduction of the model space at each iteration step. With two flavors of our\napproach and the adoption of $p(\\boldsymbol y|\\mathcal M)$ for bi-directional\nstepwise regression, we present a total of three new avenues for equation\nlearning. Through three extensive numerical experiments involving random\npolynomials and dynamical systems, we compare our approach against four\nstate-of-the-art methods and two standard approaches. The results demonstrate\nthat our comprehensive search approach surpasses all other methods in terms of\nidentification accuracy. 
In particular, the second flavor of our approach\nestablishes an efficient overfitting penalty solely based on $R^2$, which\nachieves the highest rates of exact equation recovery.\n","authors":["Daniel Nickelsen","Bubacarr Bah"],"pdf_url":"https://arxiv.org/pdf/2311.13265v2.pdf","comment":"12 pages main text and 11 pages appendix, Published in TMLR\n (https://openreview.net/forum?id=0ck7hJ8EVC)"},{"id":"http://arxiv.org/abs/2311.15658v1","updated":"2023-11-27T09:40:14Z","published":"2023-11-27T09:40:14Z","title":"Regularization by Texts for Latent Diffusion Inverse Solvers","summary":" The recent advent of diffusion models has led to significant progress in\nsolving inverse problems, leveraging these models as effective generative\npriors. Nonetheless, challenges related to the ill-posed nature of such\nproblems remain, often due to inherent ambiguities in measurements. Drawing\ninspiration from the human ability to resolve visual ambiguities through\nperceptual biases, here we introduce a novel latent diffusion inverse solver by\nincorporating regularization by texts (TReg). Specifically, TReg applies the\ntextual description of the preconception of the solution during the reverse\nsampling phase, and this description is dynamically reinforced through\nnull-text optimization for adaptive negation. Our comprehensive experimental\nresults demonstrate that TReg successfully mitigates ambiguity in latent\ndiffusion inverse solvers, enhancing their effectiveness and accuracy.\n","authors":["Jeongsol Kim","Geon Yeong Park","Hyungjin Chung","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2311.15658v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15654v1","updated":"2023-11-27T09:33:56Z","published":"2023-11-27T09:33:56Z","title":"Universal Event Detection in Time Series","summary":" In our previously published work, we introduced a supervised deep learning\nmethod for event detection in multivariate time series data, employing\nregression instead of binary classification. This simplification avoids the\nneed for point-wise labels throughout the entire dataset, relying solely on\nground truth events defined as time points or intervals. In this paper, we\nestablish mathematically that our method is universal, and capable of detecting\nany type of event with arbitrary precision under mild continuity assumptions on\nthe time series. These events may encompass change points, frauds, anomalies,\nphysical occurrences, and more. We substantiate our theoretical results using\nthe universal approximation theorem for feed-forward neural networks (FFN).\nAdditionally, we provide empirical validations that confirm our claims,\ndemonstrating that our method, with a limited number of parameters, outperforms\nother deep learning approaches, particularly for rare events and imbalanced\ndatasets from different domains.\n","authors":["Menouar Azib","Benjamin Renard","Philippe Garnier","Vincent Génot","Nicolas André"],"pdf_url":"https://arxiv.org/pdf/2311.15654v1.pdf","comment":"To be submitted to IEEE Transactions on Neural Networks and Learning\n Systems"},{"id":"http://arxiv.org/abs/2311.15649v1","updated":"2023-11-27T09:20:23Z","published":"2023-11-27T09:20:23Z","title":"RoboGPT: an intelligent agent of making embodied long-term decisions for\n daily instruction tasks","summary":" Robotic agents must master common sense and long-term sequential decisions to\nsolve daily tasks through natural language instruction. 
The developments in\nLarge Language Models (LLMs) in natural language processing have inspired\nefforts to use LLMs in complex robot planning. Despite LLMs' great\ngeneralization and comprehension of instruction tasks, LLMs-generated task\nplans sometimes lack feasibility and correctness. To address the problem, we\npropose a RoboGPT agent\\footnote{our code and dataset will be released soon}\nfor making embodied long-term decisions for daily tasks, with two modules: 1)\nLLMs-based planning with re-plan to break the task into multiple sub-goals; 2)\nRoboSkill individually designed for sub-goals to learn better navigation and\nmanipulation skills. The LLMs-based planning is enhanced with a new robotic\ndataset and re-plan, called RoboGPT. The new robotic dataset of 67k daily\ninstruction tasks is gathered for fine-tuning the Llama model and obtaining\nRoboGPT. RoboGPT planner with strong generalization can plan hundreds of daily\ninstruction tasks. Additionally, a low-computational Re-Plan module is designed\nto allow plans to flexibly adapt to the environment, thereby addressing the\nnomenclature diversity challenge. The proposed RoboGPT agent outperforms SOTA\nmethods on the ALFRED daily tasks. Moreover, RoboGPT planner exceeds SOTA\nLLM-based planners like ChatGPT in task-planning rationality for hundreds of\nunseen daily tasks, and even other domain tasks, while keeping the large\nmodel's original broad application and generality.\n","authors":["Yaran Chen","Wenbo Cui","Yuanwen Chen","Mining Tan","Xinyao Zhang","Dongbin Zhao","He Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15648v1","updated":"2023-11-27T09:20:12Z","published":"2023-11-27T09:20:12Z","title":"Reinforcement Learning from Diffusion Feedback: Q* for Image Search","summary":" Large vision-language models are steadily gaining personalization\ncapabilities at the cost of fine-tuning or data augmentation. We present two\nmodels for image generation using model-agnostic learning that align semantic\npriors with generative capabilities. RLDF, or Reinforcement Learning from\nDiffusion Feedback, is a singular approach for visual imitation through\nprior-preserving reward function guidance. This employs Q-learning (with\nstandard Q*) for generation and follows a semantic-rewarded trajectory for\nimage search through finite encoding-tailored actions. The second proposed\nmethod, noisy diffusion gradient, is optimization driven. At the root of both\nmethods is a special CFG encoding that we propose for continual semantic\nguidance. Using only a single input image and no text input, RLDF generates\nhigh-quality images over varied domains including retail, sports and\nagriculture showcasing class-consistency and strong visual diversity. Project\nwebsite is available at https://infernolia.github.io/RLDF.\n","authors":["Aboli Marathe"],"pdf_url":"https://arxiv.org/pdf/2311.15648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15647v1","updated":"2023-11-27T09:19:01Z","published":"2023-11-27T09:19:01Z","title":"Bandits Meet Mechanism Design to Combat Clickbait in Online\n Recommendation","summary":" We study a strategic variant of the multi-armed bandit problem, which we coin\nthe strategic click-bandit. This model is motivated by applications in online\nrecommendation where the choice of recommended items depends on both the\nclick-through rates and the post-click rewards. Like in classical bandits,\nrewards follow a fixed unknown distribution. 
However, we assume that the\nclick-rate of each arm is chosen strategically by the arm (e.g., a host on\nAirbnb) in order to maximize the number of times it gets clicked. The algorithm\ndesigner does not know the post-click rewards nor the arms' actions (i.e.,\nstrategically chosen click-rates) in advance, and must learn both values over\ntime. To solve this problem, we design an incentive-aware learning algorithm,\nUCB-S, which achieves two goals simultaneously: (a) incentivizing desirable arm\nbehavior under uncertainty; (b) minimizing regret by learning unknown\nparameters. We characterize all approximate Nash equilibria among arms under\nUCB-S and show a $\\tilde{\\mathcal{O}} (\\sqrt{KT})$ regret bound uniformly in\nevery equilibrium. We also show that incentive-unaware algorithms generally\nfail to achieve low regret in the strategic click-bandit. Finally, we support\nour theoretical results by simulations of strategic arm behavior which confirm\nthe effectiveness and robustness of our proposed incentive design.\n","authors":["Thomas Kleine Buening","Aadirupa Saha","Christos Dimitrakakis","Haifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2311.15647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14969v2","updated":"2023-11-27T09:06:55Z","published":"2023-08-29T01:47:49Z","title":"Uncovering the Hidden Cost of Model Compression","summary":" In the era of resource-intensive foundation models, efficient adaptation in\ndownstream tasks has become paramount. Visual Prompting (VP), inspired by\nprompting in Large Language Models (LLMs), has emerged as a key transfer\nlearning method in computer vision. Aligned with the growing significance of\nefficiency, research in model compression has become pivotal to alleviate the\ncomputational burden in both training and deploying over-parameterized neural\nnetworks. A key goal in model compression is the development of sparse models\ncapable of matching or surpassing the performance of their over-parameterized,\ndense counterparts. While prior research has explored the impact of model\nsparsity on transfer learning, its effects on visual prompting-based transfer\nremain unclear. This study addresses this gap, revealing that model sparsity\nadversely affects the performance of visual prompting-based transfer,\nparticularly in low-data-volume scenarios. Furthermore, our findings highlight\nthe negative influence of sparsity on the calibration of downstream\nvisual-prompted models. This empirical exploration calls for a nuanced\nunderstanding beyond accuracy in sparse settings, opening avenues for further\nresearch in Visual Prompting for sparse models. Code and logs can be accessed\nat https://github.com/landskape-ai/Reprogram_LT .\n","authors":["Diganta Misra","Agam Goyal","Bharat Runwal","Pin Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2308.14969v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2311.15623v1","updated":"2023-11-27T08:38:42Z","published":"2023-11-27T08:38:42Z","title":"Injecting linguistic knowledge into BERT for Dialogue State Tracking","summary":" Dialogue State Tracking (DST) models often employ intricate neural network\narchitectures, necessitating substantial training data, and their inference\nprocesses lack transparency. 
This paper proposes a method that extracts\nlinguistic knowledge via an unsupervised framework and subsequently utilizes\nthis knowledge to augment BERT's performance and interpretability in DST tasks.\nThe knowledge extraction procedure is computationally economical and does not\nnecessitate annotations or additional training data. The injection of the\nextracted knowledge necessitates the addition of only simple neural modules. We\nemploy the Convex Polytopic Model (CPM) as a feature extraction tool for DST\ntasks and illustrate that the acquired features correlate with the syntactic\nand semantic patterns in the dialogues. This correlation facilitates a\ncomprehensive understanding of the linguistic features influencing the DST\nmodel's decision-making process. We benchmark this framework on various DST\ntasks and observe a notable improvement in accuracy.\n","authors":["Xiaohan Feng","Xixin Wu","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2311.15623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15617v1","updated":"2023-11-27T08:28:08Z","published":"2023-11-27T08:28:08Z","title":"VeryFL: A Verify Federated Learning Framework Embedded with Blockchain","summary":" Blockchain-empowered federated learning (FL) has provoked extensive research\nrecently. Various blockchain-based federated learning algorithms, architectures\nand mechanisms have been designed to solve issues like single point of failure and\ndata falsification brought by the centralized FL paradigm. Moreover, it is easier\nto allocate incentives to nodes with the help of the blockchain. Various\ncentralized federated learning frameworks, like FedML, have emerged in the\ncommunity to help boost research on FL. However, a decentralized\nblockchain-based federated learning framework is still missing, which causes\ninconvenience for researchers trying to reproduce or verify algorithm performance\non blockchain. Inspired by the above issues, we have designed and\ndeveloped a blockchain-based federated learning framework by embedding the Ethereum\nnetwork. This report presents the overall structure of this framework,\nwhich proposes a code practice paradigm for combining FL with\nblockchain while remaining compatible with normal FL training tasks. In\naddition to implementing some blockchain federated learning algorithms on smart\ncontracts to help execute FL training, we also propose a model ownership\nauthentication architecture based on blockchain and model watermarking to\nprotect the intellectual property rights of models. These on-chain mechanisms\nshow the underlying support that blockchain provides for federated learning,\nyielding a verifiable training, aggregation and incentive distribution procedure;\nwe therefore named this framework VeryFL (A Verify Federated Learning Framework\nEmbedded with Blockchain). The source code is available at\nhttps://github.com/GTMLLab/VeryFL.\n","authors":["Yihao Li","Yanyi Lai","Chuan Chen","Zibin Zheng"],"pdf_url":"https://arxiv.org/pdf/2311.15617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15610v1","updated":"2023-11-27T08:10:53Z","published":"2023-11-27T08:10:53Z","title":"Bayesian Approach to Linear Bayesian Networks","summary":" This study proposes the first Bayesian approach for learning high-dimensional\nlinear Bayesian networks. The proposed approach iteratively estimates each\nelement of the topological ordering from backward and its parent using the\ninverse of a partial covariance matrix. 
The proposed method successfully\nrecovers the underlying structure when Bayesian regularization for the inverse\ncovariance matrix with unequal shrinkage is applied. Specifically, it shows\nthat the number of samples $n = \Omega( d_M^2 \log p)$ and $n = \Omega(d_M^2\np^{2/m})$ are sufficient for the proposed algorithm to learn linear Bayesian\nnetworks with sub-Gaussian and 4m-th bounded-moment error distributions,\nrespectively, where $p$ is the number of nodes and $d_M$ is the maximum degree\nof the moralized graph. The theoretical findings are supported by extensive\nsimulation studies including real data analysis. Furthermore, the proposed\nmethod is demonstrated to outperform state-of-the-art frequentist approaches,\nsuch as the BHLSM, LISTEN, and TD algorithms, on synthetic data.\n","authors":["Seyong Hwang","Kyoungjae Lee","Sunmin Oh","Gunwoong Park"],"pdf_url":"https://arxiv.org/pdf/2311.15610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2103.03543v2","updated":"2023-11-27T08:09:20Z","published":"2021-03-05T08:45:43Z","title":"Artificial Neural Networks generated by Low Discrepancy Sequences","summary":" Artificial neural networks can be represented by paths. Generated as random\nwalks on a dense network graph, we find that the resulting sparse networks\nallow for deterministic initialization and even weights with fixed sign. Such\nnetworks can be trained sparse from scratch, avoiding the expensive procedure\nof training a dense network and compressing it afterwards. Although sparse,\nweights are accessed as contiguous blocks of memory. In addition, enumerating\nthe paths using deterministic low discrepancy sequences, for example the Sobol'\nsequence, amounts to connecting the layers of neural units by progressive\npermutations, which naturally avoids bank conflicts in parallel computer\nhardware. We demonstrate that the artificial neural networks generated by low\ndiscrepancy sequences can achieve an accuracy within reach of their dense\ncounterparts at a much lower computational complexity.\n","authors":["Alexander Keller","Matthijs Van keirsbilck"],"pdf_url":"https://arxiv.org/pdf/2103.03543v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15609v1","updated":"2023-11-27T08:06:56Z","published":"2023-11-27T08:06:56Z","title":"A manometric feature descriptor with linear-SVM to distinguish\n esophageal contraction vigor","summary":" In clinical practice, if a patient presents with nonmechanical obstructive dysphagia,\nesophageal chest pain, and gastroesophageal reflux symptoms, the physician\nwill usually assess the esophageal dynamic function. High-resolution manometry\n(HRM) is a commonly used clinical technique for comprehensive and objective\ndetection of esophageal dynamic function. However, after the results of\nHRM are obtained, doctors still need to evaluate them using a variety of parameters.\nThis work is burdensome, and the process is complex. We conducted image\nprocessing of HRM to predict the esophageal contraction vigor and assist the\nevaluation of esophageal dynamic function. Firstly, we used Feature-Extraction\nand Histogram of Gradients (FE-HOG) to analyze features of the proposal of swallow\n(PoS) and further extract higher-order features. Then we classify\nesophageal contraction vigor as normal, weak or failed by using a\nlinear-SVM according to these features. Our data set includes 3000 training\nsets, 500 validation sets and 411 test sets. 
After verification our accuracy\nreaches 86.83%, which is higher than other common machine learning methods.\n","authors":["Jialin Liu","Lu Yan","Xiaowei Liu","Yuzhuo Dai","Fanggen Lu","Yuanting Ma","Muzhou Hou","Zheng Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14468v2","updated":"2023-11-27T08:04:04Z","published":"2023-11-24T13:21:35Z","title":"Efficient Gradient Estimation via Adaptive Sampling and Importance\n Sampling","summary":" Machine learning problems rely heavily on stochastic gradient descent (SGD)\nfor optimization. The effectiveness of SGD is contingent upon accurately\nestimating gradients from a mini-batch of data samples. Instead of the commonly\nused uniform sampling, adaptive or importance sampling reduces noise in\ngradient estimation by forming mini-batches that prioritize crucial data\npoints. Previous research has suggested that data points should be selected\nwith probabilities proportional to their gradient norm. Nevertheless, existing\nalgorithms have struggled to efficiently integrate importance sampling into\nmachine learning frameworks. In this work, we make two contributions. First, we\npresent an algorithm that can incorporate existing importance functions into\nour framework. Second, we propose a simplified importance function that relies\nsolely on the loss gradient of the output layer. By leveraging our proposed\ngradient estimation techniques, we observe improved convergence in\nclassification and regression tasks with minimal computational overhead. We\nvalidate the effectiveness of our adaptive and importance-sampling approach on\nimage and point-cloud datasets.\n","authors":["Corentin Salaün","Xingchang Huang","Iliyan Georgiev","Niloy J. Mitra","Gurprit Singh"],"pdf_url":"https://arxiv.org/pdf/2311.14468v2.pdf","comment":"15 pages, 10 figures"},{"id":"http://arxiv.org/abs/2311.15603v1","updated":"2023-11-27T07:53:44Z","published":"2023-11-27T07:53:44Z","title":"QuickDrop: Efficient Federated Unlearning by Integrated Dataset\n Distillation","summary":" Federated Unlearning (FU) aims to delete specific training data from an ML\nmodel trained using Federated Learning (FL). We introduce QuickDrop, an\nefficient and original FU method that utilizes dataset distillation (DD) to\naccelerate unlearning and drastically reduces computational overhead compared\nto existing approaches. In QuickDrop, each client uses DD to generate a compact\ndataset representative of the original training dataset, called a distilled\ndataset, and uses this compact dataset during unlearning. To unlearn specific\nknowledge from the global model, QuickDrop has clients execute Stochastic\nGradient Ascent with samples from the distilled datasets, thus significantly\nreducing computational overhead compared to conventional FU methods. We further\nincrease the efficiency of QuickDrop by ingeniously integrating DD into the FL\ntraining process. By reusing the gradient updates produced during FL training\nfor DD, the overhead of creating distilled datasets becomes close to\nnegligible. Evaluations on three standard datasets show that, with comparable\naccuracy guarantees, QuickDrop reduces the duration of unlearning by 463.8x\ncompared to model retraining from scratch and 65.1x compared to existing FU\napproaches. 
We also demonstrate the scalability of QuickDrop with 100 clients\nand show its effectiveness while handling multiple unlearning operations.\n","authors":["Akash Dhasade","Yaohong Ding","Song Guo","Anne-marie Kermarrec","Martijn De Vos","Leijie Wu"],"pdf_url":"https://arxiv.org/pdf/2311.15603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15598v1","updated":"2023-11-27T07:48:50Z","published":"2023-11-27T07:48:50Z","title":"Optimal Clustering of Discrete Mixtures: Binomial, Poisson, Block\n Models, and Multi-layer Networks","summary":" In this paper, we first study the fundamental limit of clustering networks\nwhen a multi-layer network is present. Under the mixture multi-layer stochastic\nblock model (MMSBM), we show that the minimax optimal network clustering error\nrate takes an exponential form and is characterized by the Renyi\ndivergence between the edge probability distributions of the component\nnetworks. We propose a novel two-stage network clustering method including a\ntensor-based initialization algorithm involving both node and sample splitting\nand a refinement procedure by a likelihood-based Lloyd algorithm. Network\nclustering must be accompanied by node community detection. Our proposed\nalgorithm achieves the minimax optimal network clustering error rate and allows\nextreme network sparsity under MMSBM. Numerical simulations and real data\nexperiments both validate that our method outperforms existing methods.\nOftentimes, the edges of networks carry count-type weights. We then extend our\nmethodology and analysis framework to study the minimax optimal clustering\nerror rate for mixtures of discrete distributions including Binomial, Poisson,\nand multi-layer Poisson networks. The minimax optimal clustering error rates in\nthese discrete mixtures all take the same exponential form characterized by the\nRenyi divergences. These optimal clustering error rates in discrete mixtures\ncan also be achieved by our proposed two-stage clustering algorithm.\n","authors":["Zhongyuan Lyu","Ting Li","Dong Xia"],"pdf_url":"https://arxiv.org/pdf/2311.15598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15599v1","updated":"2023-11-27T07:48:50Z","published":"2023-11-27T07:48:50Z","title":"UniRepLKNet: A Universal Perception Large-Kernel ConvNet for Audio,\n Video, Point Cloud, Time-Series and Image Recognition","summary":" Large-kernel convolutional neural networks (ConvNets) have recently received\nextensive research attention, but there are two unresolved and critical issues\nthat demand further investigation. 1) The architectures of existing\nlarge-kernel ConvNets largely follow the design principles of conventional\nConvNets or transformers, while the architectural design for large-kernel\nConvNets remains under-addressed. 2) As transformers have dominated multiple\nmodalities, it remains to be investigated whether ConvNets also have a strong\nuniversal perception ability in domains beyond vision. In this paper, we\ncontribute from two aspects. 1) We propose four architectural guidelines for\ndesigning large-kernel ConvNets, the core of which is to exploit the essential\ncharacteristics of large kernels that distinguish them from small kernels -\nthey can see wide without going deep. Following such guidelines, our proposed\nlarge-kernel ConvNet shows leading performance in image recognition. 
For\nexample, our models achieve an ImageNet accuracy of 88.0%, ADE20K mIoU of\n55.6%, and COCO box AP of 56.4%, demonstrating better performance and higher\nspeed than a number of recently proposed powerful competitors. 2) We discover\nthat large kernels are the key to unlocking the exceptional performance of\nConvNets in domains where they were originally not proficient. With certain\nmodality-related preprocessing approaches, the proposed model achieves\nstate-of-the-art performance on time-series forecasting and audio recognition\ntasks even without modality-specific customization to the architecture. Code\nand all the models at https://github.com/AILab-CVC/UniRepLKNet.\n","authors":["Xiaohan Ding","Yiyuan Zhang","Yixiao Ge","Sijie Zhao","Lin Song","Xiangyu Yue","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2311.15599v1.pdf","comment":"Code, all the models and reproducible training scripts at\n https://github.com/AILab-CVC/UniRepLKNet"},{"id":"http://arxiv.org/abs/2311.12612v3","updated":"2023-11-27T07:41:06Z","published":"2023-11-21T13:54:08Z","title":"A New Type Of Upper And Lower Bounds On Right-Tail Probabilities Of\n Continuous Random Variables","summary":" In this paper, I present a completely new type of upper and lower bounds on\nthe right-tail probabilities of continuous random variables with unbounded\nsupport and with semi-bounded support from the left. The presented upper and\nlower right-tail bounds depend only on the probability density function (PDF),\nits first derivative, and two parameters that are used for tightening the\nbounds. These tail bounds hold under certain conditions that depend on the PDF,\nits first and second derivatives, and the two parameters. The new tail bounds\nare shown to be tight for a wide range of continuous random variables via\nnumerical examples.\n","authors":["Nikola Zlatanov"],"pdf_url":"https://arxiv.org/pdf/2311.12612v3.pdf","comment":"Minor typos corrected v2"},{"id":"http://arxiv.org/abs/2310.20587v4","updated":"2023-11-27T07:38:06Z","published":"2023-10-31T16:24:17Z","title":"Unleashing the Power of Pre-trained Language Models for Offline\n Reinforcement Learning","summary":" Offline reinforcement learning (RL) aims to find a near-optimal policy using\npre-collected datasets. In real-world scenarios, data collection could be\ncostly and risky; therefore, offline RL becomes particularly challenging when\nthe in-domain data is limited. Given recent advances in Large Language Models\n(LLMs) and their few-shot learning prowess, this paper introduces\n$\\textbf{La}$nguage Models for $\\textbf{Mo}$tion Control ($\\textbf{LaMo}$), a\ngeneral framework based on Decision Transformers to effectively use pre-trained\nLanguage Models (LMs) for offline RL. Our framework highlights four crucial\ncomponents: (1) Initializing Decision Transformers with sequentially\npre-trained LMs, (2) employing the LoRA fine-tuning method, in contrast to\nfull-weight fine-tuning, to combine the pre-trained knowledge from LMs and\nin-domain knowledge effectively, (3) using the non-linear MLP transformation\ninstead of linear projections, to generate embeddings, and (4) integrating an\nauxiliary language prediction loss during fine-tuning to stabilize the LMs and\nretain their original abilities on languages. Empirical results indicate\n$\\textbf{LaMo}$ achieves state-of-the-art performance in sparse-reward tasks\nand closes the gap between value-based offline RL methods and decision\ntransformers in dense-reward tasks. 
In particular, our method demonstrates\nsuperior performance in scenarios with limited data samples.\n","authors":["Ruizhe Shi","Yuyao Liu","Yanjie Ze","Simon S. Du","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2310.20587v4.pdf","comment":"24 pages, 16 tables"},{"id":"http://arxiv.org/abs/2311.15587v1","updated":"2023-11-27T07:25:47Z","published":"2023-11-27T07:25:47Z","title":"Quantum Langevin Dynamics for Optimization","summary":" We initiate the study of utilizing Quantum Langevin Dynamics (QLD) to solve\noptimization problems, particularly those non-convex objective functions that\npresent substantial obstacles for traditional gradient descent algorithms.\nSpecifically, we examine the dynamics of a system coupled with an infinite heat\nbath. This interaction induces both random quantum noise and a deterministic\ndamping effect to the system, which nudge the system towards a steady state\nthat hovers near the global minimum of objective functions. We theoretically\nprove the convergence of QLD in convex landscapes, demonstrating that the\naverage energy of the system can approach zero in the low temperature limit\nwith an exponential decay rate correlated with the evolution time. Numerically,\nwe first show the energy dissipation capability of QLD by retracing its origins\nto spontaneous emission. Furthermore, we conduct detailed discussion of the\nimpact of each parameter. Finally, based on the observations when comparing QLD\nwith classical Fokker-Plank-Smoluchowski equation, we propose a time-dependent\nQLD by making temperature and $\\hbar$ time-dependent parameters, which can be\ntheoretically proven to converge better than the time-independent case and also\noutperforms a series of state-of-the-art quantum and classical optimization\nalgorithms in many non-convex landscapes.\n","authors":["Zherui Chen","Yuchen Lu","Hao Wang","Yizhou Liu","Tongyang Li"],"pdf_url":"https://arxiv.org/pdf/2311.15587v1.pdf","comment":"33 pages, 1 table, 26 figures"},{"id":"http://arxiv.org/abs/2311.14412v2","updated":"2023-11-27T07:20:42Z","published":"2023-11-24T11:12:26Z","title":"A Comparison of PDF Projection with Normalizing Flows and SurVAE","summary":" Normalizing flows (NF) recently gained attention as a way to construct\ngenerative networks with exact likelihood calculation out of composable layers.\nHowever, NF is restricted to dimension-preserving transformations. Surjection\nVAE (SurVAE) has been proposed to extend NF to dimension-altering\ntransformations. Such networks are desirable because they are expressive and\ncan be precisely trained. We show that the approaches are a re-invention of PDF\nprojection, which appeared over twenty years earlier and is much further\ndeveloped.\n","authors":["Paul M. Baggenstoss","Felix Govaers"],"pdf_url":"https://arxiv.org/pdf/2311.14412v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15584v1","updated":"2023-11-27T07:19:41Z","published":"2023-11-27T07:19:41Z","title":"A deep learning approach for marine snow synthesis and removal","summary":" Marine snow, the floating particles in underwater images, severely degrades\nthe visibility and performance of human and machine vision systems. This paper\nproposes a novel method to reduce the marine snow interference using deep\nlearning techniques. We first synthesize realistic marine snow samples by\ntraining a Generative Adversarial Network (GAN) model and combine them with\nnatural underwater images to create a paired dataset. 
We then train a U-Net\nmodel to perform marine snow removal as an image to image translation task. Our\nexperiments show that the U-Net model can effectively remove both synthetic and\nnatural marine snow with high accuracy, outperforming state-of-the-art methods\nsuch as the Median filter and its adaptive variant. We also demonstrate the\nrobustness of our method by testing it on the MSRB dataset, which contains\nsynthetic artifacts that our model has not seen during training. Our method is\na practical and efficient solution for enhancing underwater images affected by\nmarine snow.\n","authors":["Fernando Galetto","Guang Deng"],"pdf_url":"https://arxiv.org/pdf/2311.15584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15583v1","updated":"2023-11-27T07:19:23Z","published":"2023-11-27T07:19:23Z","title":"A Simple Geometric-Aware Indoor Positioning Interpolation Algorithm\n Based on Manifold Learning","summary":" Interpolation methodologies have been widely used within the domain of indoor\npositioning systems. However, existing indoor positioning interpolation\nalgorithms exhibit several inherent limitations, including reliance on complex\nmathematical models, limited flexibility, and relatively low precision. To\nenhance the accuracy and efficiency of indoor positioning interpolation\ntechniques, this paper proposes a simple yet powerful geometric-aware\ninterpolation algorithm for indoor positioning tasks. The key to our algorithm\nis to exploit the geometric attributes of the local topological manifold using\nmanifold learning principles. Therefore, instead of constructing complicated\nmathematical models, the proposed algorithm facilitates the more precise and\nefficient estimation of points grounded in the local topological manifold.\nMoreover, our proposed method can be effortlessly integrated into any indoor\npositioning system, thereby bolstering its adaptability. Through a systematic\narray of experiments and comprehensive performance analyses conducted on both\nsimulated and real-world datasets, we demonstrate that the proposed algorithm\nconsistently outperforms the most commonly used and representative\ninterpolation approaches regarding interpolation accuracy and efficiency.\nFurthermore, the experimental results also underscore the substantial practical\nutility of our method and its potential applicability in real-time indoor\npositioning scenarios.\n","authors":["Suorong Yang","Geng Zhang","Jian Zhao","Furao Shen"],"pdf_url":"https://arxiv.org/pdf/2311.15583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15582v1","updated":"2023-11-27T07:19:22Z","published":"2023-11-27T07:19:22Z","title":"Lightly Weighted Automatic Audio Parameter Extraction for the Quality\n Assessment of Consensus Auditory-Perceptual Evaluation of Voice","summary":" The Consensus Auditory-Perceptual Evaluation of Voice is a widely employed\ntool in clinical voice quality assessment that is significant for streaming\ncommunication among clinical professionals and benchmarking for the\ndetermination of further treatment. Currently, because the assessment relies on\nexperienced clinicians, it tends to be inconsistent, and thus, difficult to\nstandardize. To address this problem, we propose to leverage lightly weighted\nautomatic audio parameter extraction, to increase the clinical relevance,\nreduce the complexity, and enhance the interpretability of voice quality\nassessment. 
The proposed method utilizes age, sex, and five audio parameters:\njitter, absolute jitter, shimmer, harmonic-to-noise ratio (HNR), and zero\ncrossing. A classical machine learning approach is employed. The results reveal\nthat our approach performs similarly to state-of-the-art (SOTA) methods, and\noutperforms the latent representation obtained by using popular audio\npre-trained models. This approach provides insights into the feasibility of\ndifferent feature extraction approaches for voice evaluation. Audio parameters\nsuch as jitter and the HNR are proven to be suitable for characterizing voice\nquality attributes, such as roughness and strain. Conversely, pre-trained\nmodels exhibit limitations in effectively addressing noise-related scorings.\nThis study contributes toward more comprehensive and precise voice quality\nevaluations, achieved by comprehensively exploring diverse assessment\nmethodologies.\n","authors":["Yi-Heng Lin","Wen-Hsuan Tseng","Li-Chin Chen","Ching-Ting Tan","Yu Tsao"],"pdf_url":"https://arxiv.org/pdf/2311.15582v1.pdf","comment":"Published in the IEEE 42nd International Conference on Consumer\n Electronics (ICCE 2024)"},{"id":"http://arxiv.org/abs/2311.15578v1","updated":"2023-11-27T07:11:47Z","published":"2023-11-27T07:11:47Z","title":"Experimental Analysis of Large-scale Learnable Vector Storage\n Compression","summary":" Learnable embedding vectors are among the most important applications in\nmachine learning, and are widely used in various database-related domains.\nHowever, the high dimensionality of sparse data in recommendation tasks and the\nhuge volume of corpus in retrieval-related tasks lead to a large memory\nconsumption of the embedding table, which poses a great challenge to the\ntraining and deployment of models. Recent research has proposed various methods\nto compress the embeddings at the cost of a slight decrease in model quality or\nthe introduction of other overheads. Nevertheless, the relative performance of\nthese methods remains unclear. Existing experimental comparisons only cover a\nsubset of these methods and focus on limited metrics. In this paper, we perform\na comprehensive comparative analysis and experimental evaluation of embedding\ncompression. We introduce a new taxonomy that categorizes these techniques\nbased on their characteristics and methodologies, and further develop a modular\nbenchmarking framework that integrates 14 representative methods. Under a\nuniform test environment, our benchmark fairly evaluates each approach,\npresents their strengths and weaknesses under different memory budgets, and\nrecommends the best method based on the use case. In addition to providing\nuseful guidelines, our study also uncovers the limitations of current methods\nand suggests potential directions for future research.\n","authors":["Hailin Zhang","Penghao Zhao","Xupeng Miao","Yingxia Shao","Zirui Liu","Tong Yang","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2311.15578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11509v2","updated":"2023-11-27T06:53:03Z","published":"2023-11-20T03:17:21Z","title":"Token-Level Adversarial Prompt Detection Based on Perplexity Measures\n and Contextual Information","summary":" In recent years, Large Language Models (LLM) have emerged as pivotal tools in\nvarious applications. However, these models are susceptible to adversarial\nprompt attacks, where attackers can carefully curate input strings that lead to\nundesirable outputs. 
The inherent vulnerability of LLMs stems from their\ninput-output mechanisms, especially when presented with intensely\nout-of-distribution (OOD) inputs. This paper proposes a token-level detection\nmethod to identify adversarial prompts, leveraging the LLM's capability to\npredict the next token's probability. We measure the degree of the model's\nperplexity and incorporate neighboring token information to encourage the\ndetection of contiguous adversarial prompt sequences. As a result, we propose\ntwo methods: one that identifies each token as either being part of an\nadversarial prompt or not, and another that estimates the probability of each\ntoken being part of an adversarial prompt.\n","authors":["Zhengmian Hu","Gang Wu","Saayan Mitra","Ruiyi Zhang","Tong Sun","Heng Huang","Viswanathan Swaminathan"],"pdf_url":"https://arxiv.org/pdf/2311.11509v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15570v1","updated":"2023-11-27T06:38:07Z","published":"2023-11-27T06:38:07Z","title":"UFDA: Universal Federated Domain Adaptation with Practical Assumptions","summary":" Conventional Federated Domain Adaptation (FDA) approaches usually demand an\nabundance of assumptions, such as label set consistency, which makes them\nsignificantly less feasible for real-world situations and introduces security\nhazards. In this work, we propose a more practical scenario named Universal\nFederated Domain Adaptation (UFDA). It only requires the black-box model and\nthe label set information of each source domain, while the label sets of\ndifferent source domains could be inconsistent and the target-domain label set\nis totally blind. This relaxes the assumptions made by FDA, which are often\nchallenging to meet in real-world cases and diminish model security. To address\nthe UFDA scenario, we propose a corresponding framework called Hot-Learning\nwith Contrastive Label Disambiguation (HCLD), which tackles UFDA's domain\nshifts and category gaps problem by using one-hot outputs from the black-box\nmodels of various source domains. Moreover, to better distinguish the shared\nand unknown classes, we further present a cluster-level strategy named\nMutual-Voting Decision (MVD) to extract robust consensus knowledge across peer\nclasses from both source and target domains. The extensive experiments on three\nbenchmarks demonstrate that our HCLD achieves comparable performance for our\nUFDA scenario with much fewer assumptions, compared to the previous\nmethodologies with many additional assumptions.\n","authors":["Xinhui Liu","Zhenghao Chen","Luping Zhou","Dong Xu","Wei Xi","Gairui Bai","Yihan Zhao","Jizhong Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.15570v1.pdf","comment":"Submitted to AAAI2024"},{"id":"http://arxiv.org/abs/2311.15566v1","updated":"2023-11-27T06:31:17Z","published":"2023-11-27T06:31:17Z","title":"SpotServe: Serving Generative Large Language Models on Preemptible\n Instances","summary":" The high computational and memory requirements of generative large language\nmodels (LLMs) make it challenging to serve them cheaply. This paper aims to\nreduce the monetary cost for serving LLMs by leveraging preemptible GPU\ninstances on modern clouds, which offer accesses to spare GPUs at a much\ncheaper price than regular instances but may be preempted by the cloud at any\ntime. 
Serving LLMs on preemptible instances requires addressing challenges\ninduced by frequent instance preemptions and the necessity of migrating\ninstances to handle these preemptions.\n This paper presents SpotServe, the first distributed LLM serving system on\npreemptible instances. Several key techniques in SpotServe realize fast and\nreliable serving of generative LLMs on cheap preemptible instances. First,\nSpotServe dynamically adapts the LLM parallelization configuration for dynamic\ninstance availability and fluctuating workload, while balancing the trade-off\namong the overall throughput, inference latency and monetary costs. Second, to\nminimize the cost of migrating instances for dynamic reparallelization, the\ntask of migrating instances is formulated as a bipartite graph matching\nproblem, which uses the Kuhn-Munkres algorithm to identify an optimal migration\nplan that minimizes communications. Finally, to take advantage of the grace\nperiod offered by modern clouds, we introduce stateful inference recovery, a\nnew inference mechanism that commits inference progress at a much finer\ngranularity and allows SpotServe to cheaply resume inference upon preemption.\nWe evaluate on real spot instance preemption traces and various popular LLMs\nand show that SpotServe can reduce the P99 tail latency by 2.4 - 9.1x compared\nwith the best existing LLM serving systems. We also show that SpotServe can\nleverage the price advantage of preemptive instances, saving 54% monetary cost\ncompared with only using on-demand instances.\n","authors":["Xupeng Miao","Chunan Shi","Jiangfei Duan","Xiaoli Xi","Dahua Lin","Bin Cui","Zhihao Jia"],"pdf_url":"https://arxiv.org/pdf/2311.15566v1.pdf","comment":"ASPLOS 2024"},{"id":"http://arxiv.org/abs/2311.15565v1","updated":"2023-11-27T06:26:53Z","published":"2023-11-27T06:26:53Z","title":"Evaluating the Efficacy of Hybrid Deep Learning Models in Distinguishing\n AI-Generated Text","summary":" My research investigates the use of cutting-edge hybrid deep learning models\nto accurately differentiate between AI-generated text and human writing. I\napplied a robust methodology, utilising a carefully selected dataset comprising\nAI and human texts from various sources, each tagged with instructions.\nAdvanced natural language processing techniques facilitated the analysis of\ntextual features. Combining sophisticated neural networks, the custom model\nenabled it to detect nuanced differences between AI and human content.\n","authors":["Finbarrs Oketunji"],"pdf_url":"https://arxiv.org/pdf/2311.15565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00341v2","updated":"2023-11-27T05:51:13Z","published":"2023-11-01T07:21:08Z","title":"The Open DAC 2023 Dataset and Challenges for Sorbent Discovery in Direct\n Air Capture","summary":" New methods for carbon dioxide removal are urgently needed to combat global\nclimate change. Direct air capture (DAC) is an emerging technology to capture\ncarbon dioxide directly from ambient air. Metal-organic frameworks (MOFs) have\nbeen widely studied as potentially customizable adsorbents for DAC. However,\ndiscovering promising MOF sorbents for DAC is challenging because of the vast\nchemical space to explore and the need to understand materials as functions of\nhumidity and temperature. 
We explore a computational approach benefiting from\nrecent innovations in machine learning (ML) and present a dataset named Open\nDAC 2023 (ODAC23) consisting of more than 38M density functional theory (DFT)\ncalculations on more than 8,400 MOF materials containing adsorbed $CO_2$ and/or\n$H_2O$. ODAC23 is by far the largest dataset of MOF adsorption calculations at\nthe DFT level of accuracy currently available. In addition to probing\nproperties of adsorbed molecules, the dataset is a rich source of information\non structural relaxation of MOFs, which will be useful in many contexts beyond\nspecific applications for DAC. A large number of MOFs with promising properties\nfor DAC are identified directly in ODAC23. We also trained state-of-the-art ML\nmodels on this dataset to approximate calculations at the DFT level. This\nopen-source dataset and our initial ML models will provide an important\nbaseline for future efforts to identify MOFs for a wide range of applications,\nincluding DAC.\n","authors":["Anuroop Sriram","Sihoon Choi","Xiaohan Yu","Logan M. Brabson","Abhishek Das","Zachary Ulissi","Matt Uyttendaele","Andrew J. Medford","David S. Sholl"],"pdf_url":"https://arxiv.org/pdf/2311.00341v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15551v1","updated":"2023-11-27T05:35:49Z","published":"2023-11-27T05:35:49Z","title":"Instruct2Attack: Language-Guided Semantic Adversarial Attacks","summary":" We propose Instruct2Attack (I2A), a language-guided semantic attack that\ngenerates semantically meaningful perturbations according to free-form language\ninstructions. We make use of state-of-the-art latent diffusion models, where we\nadversarially guide the reverse diffusion process to search for an adversarial\nlatent code conditioned on the input image and text instruction. Compared to\nexisting noise-based and semantic attacks, I2A generates more natural and\ndiverse adversarial examples while providing better controllability and\ninterpretability. We further automate the attack process with GPT-4 to generate\ndiverse image-specific text instructions. We show that I2A can successfully\nbreak state-of-the-art deep neural networks even under strong adversarial\ndefenses, and demonstrate great transferability among a variety of network\narchitectures.\n","authors":["Jiang Liu","Chen Wei","Yuxiang Guo","Heng Yu","Alan Yuille","Soheil Feizi","Chun Pong Lau","Rama Chellappa"],"pdf_url":"https://arxiv.org/pdf/2311.15551v1.pdf","comment":"under submission, code coming soon"},{"id":"http://arxiv.org/abs/2311.15549v1","updated":"2023-11-27T05:29:43Z","published":"2023-11-27T05:29:43Z","title":"From Prediction to Action: The Critical Role of Proper Performance\n Estimation for Machine-Learning-Driven Materials Discovery","summary":" Materials discovery driven by statistical property models is an iterative\ndecision process, during which an initial data collection is extended with new\ndata proposed by a model-informed acquisition function--with the goal to\nmaximize a certain \"reward\" over time, such as the maximum property value\ndiscovered so far. While the materials science community achieved much progress\nin developing property models that predict well on average with respect to the\ntraining distribution, this form of in-distribution performance measurement is\nnot directly coupled with the discovery reward. This is because an iterative\ndiscovery process has a shifting reward distribution that is\nover-proportionally determined by the model performance for exceptional\nmaterials. 
We demonstrate this problem using the example of bulk modulus\nmaximization among double perovskite oxides. We find that the in-distribution\npredictive performance suggests random forests as superior to Gaussian process\nregression, while the results are inverse in terms of the discovery rewards. We\nargue that the lack of proper performance estimation methods from pre-computed\ndata collections is a fundamental problem for improving data-driven materials\ndiscovery, and we propose a novel such estimator that, in contrast to na\\\"ive\nreward estimation, successfully predicts Gaussian processes with the \"expected\nimprovement\" acquisition function as the best out of four options in our\ndemonstrational study for double perovskites. Importantly, it does so without\nrequiring the over thousand ab initio computations that were needed to confirm\nthis prediction.\n","authors":["Mario Boley","Felix Luong","Simon Teshuva","Daniel F Schmidt","Lucas Foppa","Matthias Scheffler"],"pdf_url":"https://arxiv.org/pdf/2311.15549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15548v1","updated":"2023-11-27T05:27:13Z","published":"2023-11-27T05:27:13Z","title":"Deficiency of Large Language Models in Finance: An Empirical Examination\n of Hallucination","summary":" The hallucination issue is recognized as a fundamental deficiency of large\nlanguage models (LLMs), especially when applied to fields such as finance,\neducation, and law. Despite the growing concerns, there has been a lack of\nempirical investigation. In this paper, we provide an empirical examination of\nLLMs' hallucination behaviors in financial tasks. First, we empirically\ninvestigate LLM model's ability of explaining financial concepts and\nterminologies. Second, we assess LLM models' capacity of querying historical\nstock prices. Third, to alleviate the hallucination issue, we evaluate the\nefficacy of four practical methods, including few-shot learning, Decoding by\nContrasting Layers (DoLa), the Retrieval Augmentation Generation (RAG) method\nand the prompt-based tool learning method for a function to generate a query\ncommand. Finally, our major finding is that off-the-shelf LLMs experience\nserious hallucination behaviors in financial tasks. Therefore, there is an\nurgent need to call for research efforts in mitigating LLMs' hallucination.\n","authors":["Haoqiang Kang","Xiao-Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.15548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15547v1","updated":"2023-11-27T05:23:01Z","published":"2023-11-27T05:23:01Z","title":"Dataset Distillation in Latent Space","summary":" Dataset distillation (DD) is a newly emerging research area aiming at\nalleviating the heavy computational load in training models on large datasets.\nIt tries to distill a large dataset into a small and condensed one so that\nmodels trained on the distilled dataset can perform comparably with those\ntrained on the full dataset when performing downstream tasks. Among the\nprevious works in this area, there are three key problems that hinder the\nperformance and availability of the existing DD methods: high time complexity,\nhigh space complexity, and low info-compactness. In this work, we\nsimultaneously attempt to settle these three problems by moving the DD\nprocesses from conventionally used pixel space to latent space. 
Encoded by a\npretrained generic autoencoder, latent codes in the latent space are naturally\ninfo-compact representations of the original images in much smaller sizes.\nAfter transferring three mainstream DD algorithms to latent space, we\nsignificantly reduce time and space consumption while achieving similar\nperformance, allowing us to distill high-resolution datasets or target at\ngreater data ratio that previous methods have failed. Besides, within the same\nstorage budget, we can also quantitatively deliver more latent codes than\npixel-level images, which further boosts the performance of our methods.\n","authors":["Yuxuan Duan","Jianfu Zhang","Liqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15547v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2311.15545v1","updated":"2023-11-27T05:21:08Z","published":"2023-11-27T05:21:08Z","title":"Out-of-Distribution Generalized Dynamic Graph Neural Network for Human\n Albumin Prediction","summary":" Human albumin is essential for indicating the body's overall health.\nAccurately predicting plasma albumin levels and determining appropriate doses\nare urgent clinical challenges, particularly in critically ill patients, to\nmaintain optimal blood levels. However, human albumin prediction is non-trivial\nthat has to leverage the dynamics of biochemical markers as well as the\nexperience of treating patients. Moreover, the problem of distribution shift is\noften encountered in real clinical data, which may lead to a decline in the\nmodel prediction performance and reduce the reliability of the model's\napplication. In this paper, we propose a framework named Out-of-Distribution\nGeneralized Dynamic Graph Neural Network for Human Albumin Prediction\n(DyG-HAP), which is able to provide accurate albumin predictions for Intensity\nCare Unit (ICU) patients during hospitalization. We first model human albumin\nprediction as a dynamic graph regression problem to model the dynamics and\npatient relationship. Then, we propose a disentangled dynamic graph attention\nmechanism to capture and disentangle the patterns whose relationship to labels\nunder distribution shifts is invariant and variant respectively. Last, we\npropose an invariant dynamic graph regression method to encourage the model to\nrely on invariant patterns to make predictions. Moreover, we propose a dataset\nnamed Albumin level testing and nutritional dosing data for Intensive Care\n(ANIC) for evaluation. Extensive experiments demonstrate the superiority of our\nmethod compared to several baseline methods in human albumin prediction.\n","authors":["Zeyang Zhang","Xingwang Li","Fei Teng","Ning Lin","Xueling Zhu","Xin Wang","Wenwu Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.15545v1.pdf","comment":"MedAI'23"},{"id":"http://arxiv.org/abs/2309.01947v2","updated":"2023-11-27T05:03:31Z","published":"2023-09-05T04:47:55Z","title":"TODM: Train Once Deploy Many Efficient Supernet-Based RNN-T Compression\n For On-device ASR Models","summary":" Automatic Speech Recognition (ASR) models need to be optimized for specific\nhardware before they can be deployed on devices. This can be done by tuning the\nmodel's hyperparameters or exploring variations in its architecture.\nRe-training and re-validating models after making these changes can be a\nresource-intensive task. This paper presents TODM (Train Once Deploy Many), a\nnew approach to efficiently train many sizes of hardware-friendly on-device ASR\nmodels with comparable GPU-hours to that of a single training job. 
TODM\nleverages insights from prior work on Supernet, where Recurrent Neural Network\nTransducer (RNN-T) models share weights within a Supernet. It reduces layer\nsizes and widths of the Supernet to obtain subnetworks, making them smaller\nmodels suitable for all hardware types. We introduce a novel combination of\nthree techniques to improve the outcomes of the TODM Supernet: adaptive\ndropouts, an in-place Alpha-divergence knowledge distillation, and the use of\nScaledAdam optimizer. We validate our approach by comparing Supernet-trained\nversus individually tuned Multi-Head State Space Model (MH-SSM) RNN-T using\nLibriSpeech. Results demonstrate that our TODM Supernet either matches or\nsurpasses the performance of manually tuned models by up to a relative of 3%\nbetter in word error rate (WER), while efficiently keeping the cost of training\nmany models at a small constant.\n","authors":["Yuan Shangguan","Haichuan Yang","Danni Li","Chunyang Wu","Yassir Fathullah","Dilin Wang","Ayushi Dalmia","Raghuraman Krishnamoorthi","Ozlem Kalinli","Junteng Jia","Jay Mahadeokar","Xin Lei","Mike Seltzer","Vikas Chandra"],"pdf_url":"https://arxiv.org/pdf/2309.01947v2.pdf","comment":"Meta AI; Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2308.16781v4","updated":"2023-11-27T05:03:14Z","published":"2023-08-31T14:59:32Z","title":"StratMed: Relevance Stratification between Biomedical Entities for\n Sparsity on Medication Recommendation","summary":" With the growing imbalance between limited medical resources and escalating\ndemands, AI-based clinical tasks have become paramount. As a sub-domain,\nmedication recommendation aims to amalgamate longitudinal patient history with\nmedical knowledge, assisting physicians in prescribing safer and more accurate\nmedication combinations. Existing works ignore the inherent long-tailed\ndistribution of medical data, have uneven learning strengths for hot and sparse\ndata, and fail to balance safety and accuracy. To address the above\nlimitations, we propose StratMed, which introduces a stratification strategy\nthat overcomes the long-tailed problem and achieves fuller learning of sparse\ndata. It also utilizes a dual-property network to address the issue of mutual\nconstraints on the safety and accuracy of medication combinations,\nsynergistically enhancing these two properties. Specifically, we construct a\npre-training method using deep learning networks to obtain medication and\ndisease representations. After that, we design a pyramid-like stratification\nmethod based on relevance to strengthen the expressiveness of sparse data.\nBased on this relevance, we design two graph structures to express medication\nsafety and precision at the same level to obtain patient representations.\nFinally, the patient's historical clinical information is fitted to generate\nmedication combinations for the current health condition. We employed the\nMIMIC-III dataset to evaluate our model against state-of-the-art methods in\nthree aspects comprehensively. 
Compared to the sub-optimal baseline model, our\nmodel reduces safety risk by 15.08\\%, improves accuracy by 0.36\\%, and reduces\ntraining time consumption by 81.66\\%.\n","authors":["Xiang Li","Shunpan Liang","Yulei Hou","Tengfei Ma"],"pdf_url":"https://arxiv.org/pdf/2308.16781v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15536v1","updated":"2023-11-27T04:49:24Z","published":"2023-11-27T04:49:24Z","title":"SVRDA: A Web-based Dataset Annotation Tool for Slice-to-Volume\n Registration","summary":" Background and Objective: The lack of benchmark datasets has impeded the\ndevelopment of slice-to-volume registration algorithms. Such datasets are\ndifficult to annotate, primarily due to the dimensional difference within data\nand the dearth of task-specific software. We aim to develop a user-friendly\ntool to streamline dataset annotation for slice-to-volume registration.\n Methods: The proposed tool, named SVRDA, is an installation-free web\napplication for platform-agnostic collaborative dataset annotation. It enables\nefficient transformation manipulation via keyboard shortcuts and smooth case\ntransitions with auto-saving. SVRDA supports configuration-based data loading\nand adheres to the separation of concerns, offering great flexibility and\nextensibility for future research. Various supplementary features have been\nimplemented to facilitate slice-to-volume registration.\n Results: We validated the effectiveness of SVRDA by indirectly evaluating the\npost-registration segmentation quality on UK Biobank data, observing a dramatic\noverall improvement (24.02% in the Dice Similarity Coefficient and 48.93% in\nthe 95th percentile Hausdorff distance, respectively) supported by highly\nstatistically significant evidence ($p<0.001$).We further showcased the\nclinical usage of SVRDA by integrating it into test-retest T1 quantification on\nin-house magnetic resonance images, leading to more consistent results after\nregistration.\n Conclusions: SVRDA can facilitate collaborative annotation of benchmark\ndatasets while being potentially applicable to other pipelines incorporating\nslice-to-volume registration. Full source code and documentation are available\nat https://github.com/Roldbach/SVRDA\n","authors":["Weixun Luo","Alexandre Triay Bagur","Paul Aljabar","George Ralli","Sir Michael Brady"],"pdf_url":"https://arxiv.org/pdf/2311.15536v1.pdf","comment":"18 pages, 11 figures, In submission to Computer Methods and Programs\n in Biomedicine"},{"id":"http://arxiv.org/abs/2205.13748v2","updated":"2023-11-27T04:41:51Z","published":"2022-05-27T03:24:31Z","title":"Auto-PINN: Understanding and Optimizing Physics-Informed Neural\n Architecture","summary":" Physics-informed neural networks (PINNs) are revolutionizing science and\nengineering practice by bringing together the power of deep learning to bear on\nscientific computation. In forward modeling problems, PINNs are meshless\npartial differential equation (PDE) solvers that can handle irregular,\nhigh-dimensional physical domains. Naturally, the neural architecture\nhyperparameters have a large impact on the efficiency and accuracy of the PINN\nsolver. However, this remains an open and challenging problem because of the\nlarge search space and the difficulty of identifying a proper search objective\nfor PDEs. Here, we propose Auto-PINN, the first systematic, automated\nhyperparameter optimization approach for PINNs, which employs Neural\nArchitecture Search (NAS) techniques to PINN design. 
Auto-PINN avoids manually\nor exhaustively searching the hyperparameter space associated with PINNs. A\ncomprehensive set of pre-experiments using standard PDE benchmarks allows us to\nprobe the structure-performance relationship in PINNs. We find that the\ndifferent hyperparameters can be decoupled, and that the training loss function\nof PINNs is a good search objective. Comparison experiments with baseline\nmethods demonstrate that Auto-PINN produces neural architectures with superior\nstability and accuracy over alternative baselines.\n","authors":["Yicheng Wang","Xiaotian Han","Chia-Yuan Chang","Daochen Zha","Ulisses Braga-Neto","Xia Hu"],"pdf_url":"https://arxiv.org/pdf/2205.13748v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15530v1","updated":"2023-11-27T04:23:47Z","published":"2023-11-27T04:23:47Z","title":"SSIN: Self-Supervised Learning for Rainfall Spatial Interpolation","summary":" The acquisition of accurate rainfall distribution in space is an important\ntask in hydrological analysis and natural disaster pre-warning. However, it is\nimpossible to install rain gauges on every corner. Spatial interpolation is a\ncommon way to infer rainfall distribution based on available raingauge data.\nHowever, the existing works rely on some unrealistic pre-settings to capture\nspatial correlations, which limits their performance in real scenarios. To\ntackle this issue, we propose the SSIN, which is a novel data-driven\nself-supervised learning framework for rainfall spatial interpolation by mining\nlatent spatial patterns from historical observation data. Inspired by the Cloze\ntask and BERT, we fully consider the characteristics of spatial interpolation\nand design the SpaFormer model based on the Transformer architecture as the\ncore of SSIN. Our main idea is: by constructing rich self-supervision signals\nvia random masking, SpaFormer can learn informative embeddings for raw data and\nthen adaptively model spatial correlations based on rainfall spatial context.\nExtensive experiments on two real-world raingauge datasets show that our method\noutperforms the state-of-the-art solutions. In addition, we take traffic\nspatial interpolation as another use case to further explore the performance of\nour method, and SpaFormer achieves the best performance on one large real-world\ntraffic dataset, which further confirms the effectiveness and generality of our\nmethod.\n","authors":["Jia Li","Yanyan Shen","Lei Chen","Charles Wang Wai NG"],"pdf_url":"https://arxiv.org/pdf/2311.15530v1.pdf","comment":"SIGMOD 2023 Data-intensive Applications (DIA) Track; Code is\n available at https://github.com/jlidw/SSIN"},{"id":"http://arxiv.org/abs/2308.12532v2","updated":"2023-11-27T03:33:37Z","published":"2023-08-24T03:43:02Z","title":"FedSoL: Bridging Global Alignment and Local Generality in Federated\n Learning","summary":" Federated Learning (FL) aggregates locally trained models from individual\nclients to construct a global model. While FL enables learning a model with\ndata privacy, it often suffers from significant performance degradation when\nclient data distributions are heterogeneous. Many previous FL algorithms have\naddressed this issue by introducing various proximal restrictions. These\nrestrictions aim to encourage global alignment by constraining the deviation of\nlocal learning from the global objective. However, they inherently limit local\nlearning by interfering with the original local objectives. 
Recently, an\nalternative approach has emerged to improve local learning generality. By\nobtaining local models within a smooth loss landscape, this approach mitigates\nconflicts among different local objectives of the clients. Yet, it does not\nensure stable global alignment, as local learning does not take the global\nobjective into account. In this study, we propose Federated Stability on\nLearning (FedSoL), which combines both the concepts of global alignment and\nlocal generality. In FedSoL, the local learning seeks a parameter region robust\nagainst proximal perturbations. This strategy introduces an implicit proximal\nrestriction effect in local learning while maintaining the original local\nobjective for parameter update. Our experiments show that FedSoL consistently\nachieves state-of-the-art performance on various setups.\n","authors":["Gihun Lee","Minchan Jeong","Sangmook Kim","Jaehoon Oh","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2308.12532v2.pdf","comment":"19 pages, 12 figures"},{"id":"http://arxiv.org/abs/2311.15516v1","updated":"2023-11-27T03:25:12Z","published":"2023-11-27T03:25:12Z","title":"Active Foundational Models for Fault Diagnosis of Electrical Motors","summary":" Fault detection and diagnosis of electrical motors are of utmost importance\nin ensuring the safe and reliable operation of several industrial systems.\nDetection and diagnosis of faults at the incipient stage allows corrective\nactions to be taken in order to reduce the severity of faults. The existing\ndata-driven deep learning approaches for machine fault diagnosis rely\nextensively on huge amounts of labeled samples, where annotations are expensive\nand time-consuming. However, a major portion of unlabeled condition monitoring\ndata is not exploited in the training process. To overcome this limitation, we\npropose a foundational model-based Active Learning framework that utilizes less\namount of labeled samples, which are most informative and harnesses a large\namount of available unlabeled data by effectively combining Active Learning and\nContrastive Self-Supervised Learning techniques. It consists of a transformer\nnetwork-based backbone model trained using an advanced nearest-neighbor\ncontrastive self-supervised learning method. This approach empowers the\nbackbone to learn improved representations of samples derived from raw,\nunlabeled vibration data. Subsequently, the backbone can undergo fine-tuning to\naddress a range of downstream tasks, both within the same machines and across\ndifferent machines. The effectiveness of the proposed methodology has been\nassessed through the fine-tuning of the backbone for multiple target tasks\nusing three distinct machine-bearing fault datasets. The experimental\nevaluation demonstrates a superior performance as compared to existing\nstate-of-the-art fault diagnosis methods with less amount of labeled data.\n","authors":["Sriram Anbalagan","Sai Shashank GP","Deepesh Agarwal","Balasubramaniam Natarajan","Babji Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2311.15516v1.pdf","comment":"30 pages, 2 figures, 7 tables"},{"id":"http://arxiv.org/abs/2311.15513v1","updated":"2023-11-27T03:17:09Z","published":"2023-11-27T03:17:09Z","title":"A Comparative and Experimental Study on Automatic Question Answering\n Systems and its Robustness against Word Jumbling","summary":" Question answer generation using Natural Language Processing models is\nubiquitous in the world around us. 
It is used in many use cases such as the\nbuilding of chat bots, suggestive prompts in google search and also as a way of\nnavigating information in banking mobile applications etc. It is highly\nrelevant because a frequently asked questions (FAQ) list can only have a finite\namount of questions but a model which can perform question answer generation\ncould be able to answer completely new questions that are within the scope of\nthe data. This helps us to be able to answer new questions accurately as long\nas it is a relevant question. In commercial applications, it can be used to\nincrease customer satisfaction and ease of usage. However a lot of data is\ngenerated by humans so it is susceptible to human error and this can adversely\naffect the model's performance and we are investigating this through our work\n","authors":["Shashidhar Reddy Javaji","Haoran Hu","Sai Sameer Vennam","Vijaya Gajanan Buddhavarapu"],"pdf_url":"https://arxiv.org/pdf/2311.15513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02858v3","updated":"2023-11-27T03:15:34Z","published":"2023-04-06T04:37:10Z","title":"A review of ensemble learning and data augmentation models for class\n imbalanced problems: combination, implementation and evaluation","summary":" Class imbalance (CI) in classification problems arises when the number of\nobservations belonging to one class is lower than the other. Ensemble learning\ncombines multiple models to obtain a robust model and has been prominently used\nwith data augmentation methods to address class imbalance problems. In the last\ndecade, a number of strategies have been added to enhance ensemble learning and\ndata augmentation methods, along with new methods such as generative\nadversarial networks (GANs). A combination of these has been applied in many\nstudies, and the evaluation of different combinations would enable a better\nunderstanding and guidance for different application domains. In this paper, we\npresent a computational study to evaluate data augmentation and ensemble\nlearning methods used to address prominent benchmark CI problems. We present a\ngeneral framework that evaluates 9 data augmentation and 9 ensemble learning\nmethods for CI problems. Our objective is to identify the most effective\ncombination for improving classification performance on imbalanced datasets.\nThe results indicate that combinations of data augmentation methods with\nensemble learning can significantly improve classification performance on\nimbalanced datasets. We find that traditional data augmentation methods such as\nthe synthetic minority oversampling technique (SMOTE) and random oversampling\n(ROS) are not only better in performance for selected CI problems, but also\ncomputationally less expensive than GANs. Our study is vital for the\ndevelopment of novel models for handling imbalanced datasets.\n","authors":["Azal Ahmad Khan","Omkar Chaudhari","Rohitash Chandra"],"pdf_url":"https://arxiv.org/pdf/2304.02858v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09740v3","updated":"2023-11-27T03:09:21Z","published":"2023-11-16T10:13:09Z","title":"Redefining Super-Resolution: Fine-mesh PDE predictions without classical\n simulations","summary":" In Computational Fluid Dynamics (CFD), coarse mesh simulations offer\ncomputational efficiency but often lack precision. 
Applying conventional\nsuper-resolution to these simulations poses a significant challenge due to the\nfundamental contrast between downsampling high-resolution images and\nauthentically emulating low-resolution physics. The former method conserves\nmore of the underlying physics, surpassing the usual constraints of real-world\nscenarios. We propose a novel definition of super-resolution tailored for\nPDE-based problems. Instead of simply downsampling from a high-resolution\ndataset, we use coarse-grid simulated data as our input and predict fine-grid\nsimulated outcomes. Employing a physics-infused UNet upscaling method, we\ndemonstrate its efficacy across various 2D-CFD problems such as discontinuity\ndetection in Burger's equation, Methane combustion, and fouling in Industrial\nheat exchangers. Our method enables the generation of fine-mesh solutions\nbypassing traditional simulation, ensuring considerable computational saving\nand fidelity to the original ground truth outcomes. Through diverse boundary\nconditions during training, we further establish the robustness of our method,\npaving the way for its broad applications in engineering and scientific CFD\nsolvers.\n","authors":["Rajat Kumar Sarkar","Ritam Majumdar","Vishal Jadhav","Sagar Srinivas Sakhinana","Venkataramana Runkana"],"pdf_url":"https://arxiv.org/pdf/2311.09740v3.pdf","comment":"Accepted at Machine Learning and the Physical Sciences Workshop,\n NeurIPS 2023"},{"id":"http://arxiv.org/abs/2211.04686v3","updated":"2023-11-27T03:07:32Z","published":"2022-11-09T05:18:08Z","title":"Directional Privacy for Deep Learning","summary":" Differentially Private Stochastic Gradient Descent (DP-SGD) is a key method\nfor applying privacy in the training of deep learning models. It applies\nisotropic Gaussian noise to gradients during training, which can perturb these\ngradients in any direction, damaging utility. Metric DP, however, can provide\nalternative mechanisms based on arbitrary metrics that might be more suitable\nfor preserving utility. In this paper, we apply \\textit{directional privacy},\nvia a mechanism based on the von Mises-Fisher (VMF) distribution, to perturb\ngradients in terms of \\textit{angular distance} so that gradient direction is\nbroadly preserved. We show that this provides both $\\epsilon$-DP and $\\epsilon\nd$-privacy for deep learning training, rather than the $(\\epsilon,\n\\delta)$-privacy of the Gaussian mechanism. Experiments on key datasets then\nindicate that the VMF mechanism can outperform the Gaussian in the\nutility-privacy trade-off. In particular, our experiments provide a direct\nempirical comparison of privacy between the two approaches in terms of their\nability to defend against reconstruction and membership inference.\n","authors":["Pedro Faustini","Natasha Fernandes","Shakila Tonni","Annabelle McIver","Mark Dras"],"pdf_url":"https://arxiv.org/pdf/2211.04686v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15502v1","updated":"2023-11-27T02:59:17Z","published":"2023-11-27T02:59:17Z","title":"Learning with Complementary Labels Revisited: A Consistent Approach via\n Negative-Unlabeled Learning","summary":" Complementary-label learning is a weakly supervised learning problem in which\neach training example is associated with one or multiple complementary labels\nindicating the classes to which it does not belong. 
Existing consistent\napproaches have relied on the uniform distribution assumption to model the\ngeneration of complementary labels, or on an ordinary-label training set to\nestimate the transition matrix. However, both conditions may not be satisfied\nin real-world scenarios. In this paper, we propose a novel complementary-label\nlearning approach that does not rely on these conditions. We find that\ncomplementary-label learning can be expressed as a set of negative-unlabeled\nbinary classification problems when using the one-versus-rest strategy. This\nobservation allows us to propose a risk-consistent approach with theoretical\nguarantees. Furthermore, we introduce a risk correction approach to address\noverfitting problems when using complex models. We also prove the statistical\nconsistency and convergence rate of the corrected risk estimator. Extensive\nexperimental results on both synthetic and real-world benchmark datasets\nvalidate the superiority of our proposed approach over state-of-the-art\nmethods.\n","authors":["Wei Wang","Takashi Ishida","Yu-Jie Zhang","Gang Niu","Masashi Sugiyama"],"pdf_url":"https://arxiv.org/pdf/2311.15502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15500v1","updated":"2023-11-27T02:55:34Z","published":"2023-11-27T02:55:34Z","title":"Function-constrained Program Synthesis","summary":" This work introduces (1) a technique that allows large language models (LLMs)\nto leverage user-provided code when solving programming tasks and (2) a method\nto iteratively generate modular sub-functions that can aid future code\ngeneration attempts when the initial code generated by the LLM is inadequate.\nGenerating computer programs in general-purpose programming languages like\nPython poses a challenge for LLMs when instructed to use code provided in the\nprompt. Code-specific LLMs (e.g., GitHub Copilot, CodeLlama2) can generate code\ncompletions in real-time by drawing on all code available in a development\nenvironment. However, restricting code-specific LLMs to use only in-context\ncode is not straightforward, as the model is not explicitly instructed to use\nthe user-provided code and users cannot highlight precisely which snippets of\ncode the model should incorporate into its context. Moreover, current systems\nlack effective recovery methods, forcing users to iteratively re-prompt the\nmodel with modified prompts until a sufficient solution is reached. Our method\ndiffers from traditional LLM-powered code-generation by constraining\ncode-generation to an explicit function set and enabling recovery from failed\nattempts through automatically generated sub-functions. When the LLM cannot\nproduce working code, we generate modular sub-functions to aid subsequent\nattempts at generating functional code. A by-product of our method is a library\nof reusable sub-functions that can solve related tasks, imitating a software\nteam where efficiency scales with experience. We also introduce a new\n\"half-shot\" evaluation paradigm that provides tighter estimates of LLMs' coding\nabilities compared to traditional zero-shot evaluation. 
Our proposed evaluation\nmethod encourages models to output solutions in a structured format, decreasing\nsyntax errors that can be mistaken for poor coding ability.\n","authors":["Patrick Hajali","Ignas Budvytis"],"pdf_url":"https://arxiv.org/pdf/2311.15500v1.pdf","comment":"17 pages, 6 figures, 2023 NeurIPS R0-Fomo Workshop"},{"id":"http://arxiv.org/abs/2311.15497v1","updated":"2023-11-27T02:48:06Z","published":"2023-11-27T02:48:06Z","title":"Adaptive Image Registration: A Hybrid Approach Integrating Deep Learning\n and Optimization Functions for Enhanced Precision","summary":" Image registration has traditionally been done using two distinct approaches:\nlearning based methods, relying on robust deep neural networks, and\noptimization-based methods, applying complex mathematical transformations to\nwarp images accordingly. Of course, both paradigms offer advantages and\ndisadvantages, and, in this work, we seek to combine their respective strengths\ninto a single streamlined framework, using the outputs of the learning based\nmethod as initial parameters for optimization while prioritizing computational\npower for the image pairs that offer the greatest loss. Our investigations\nshowed that an improvement of 0.3\\% in testing when utilizing the best\nperforming state-of-the-art model as the backbone of the framework, while\nmaintaining the same inference time and with only a 0.8\\% loss in deformation\nfield smoothness.\n","authors":["Gabriel De Araujo","Shanlin Sun","Xiaohui Xie"],"pdf_url":"https://arxiv.org/pdf/2311.15497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15487v1","updated":"2023-11-27T02:12:02Z","published":"2023-11-27T02:12:02Z","title":"Global $\\mathcal{L}^2$ minimization with certainty via geometrically\n adapted gradient descent in Deep Learning","summary":" We consider the gradient descent flow widely used for the minimization of the\n$\\mathcal{L}^2$ cost function in Deep Learning networks, and introduce two\nmodified versions; one adapted for the overparametrized setting, and the other\nfor the underparametrized setting. Both have a clear and natural invariant\ngeometric meaning, taking into account the pullback vector bundle structure in\nthe overparametrized, and the pushforward vector bundle structure in the\nunderparametrized setting. In the overparametrized case, we prove that,\nprovided that a rank condition holds, all orbits of the modified gradient\ndescent drive the $\\mathcal{L}^2$ cost to its global minimum at a uniform\nexponential convergence rate. We point out relations of the latter to\nsub-Riemannian geometry.\n","authors":["Thomas Chen"],"pdf_url":"https://arxiv.org/pdf/2311.15487v1.pdf","comment":"AMS Latex, 12 pages"},{"id":"http://arxiv.org/abs/2311.16380v1","updated":"2023-11-27T23:56:59Z","published":"2023-11-27T23:56:59Z","title":"Learning Multimodal Latent Dynamics for Human-Robot Interaction","summary":" This article presents a method for learning well-coordinated Human-Robot\nInteraction (HRI) from Human-Human Interactions (HHI). We devise a hybrid\napproach using Hidden Markov Models (HMMs) as the latent space priors for a\nVariational Autoencoder to model a joint distribution over the interacting\nagents. We leverage the interaction dynamics learned from HHI to learn HRI and\nincorporate the conditional generation of robot motions from human observations\ninto the training, thereby predicting more accurate robot trajectories. 
The\ngenerated robot motions are further adapted with Inverse Kinematics to ensure\nthe desired physical proximity with a human, combining the ease of joint space\nlearning and accurate task space reachability. For contact-rich interactions,\nwe modulate the robot's stiffness using HMM segmentation for a compliant\ninteraction. We verify the effectiveness of our approach deployed on a Humanoid\nrobot via a user study. Our method generalizes well to various humans despite\nbeing trained on data from just two humans. We find that Users perceive our\nmethod as more human-like, timely, and accurate and rank our method with a\nhigher degree of preference over other baselines.\n","authors":["Vignesh Prasad","Lea Heitlinger","Dorothea Koert","Ruth Stock-Homburg","Jan Peters","Georgia Chalvatzaki"],"pdf_url":"https://arxiv.org/pdf/2311.16380v1.pdf","comment":"20 Pages, 10 Figures"},{"id":"http://arxiv.org/abs/2311.16378v1","updated":"2023-11-27T23:53:19Z","published":"2023-11-27T23:53:19Z","title":"Bayesian Formulations for Graph Spectral Denoising","summary":" We consider noisy signals which are defined on the vertices of a graph and\npresent smoothing algorithms for the cases of Gaussian, dropout, and uniformly\ndistributed noise. The signals are assumed to follow a prior distribution\ndefined in the frequency domain which favors signals which are smooth across\nthe edges of the graph. By pairing this prior distribution with our three\nmodels of noise generation, we propose \\textit{Maximum A Posteriori} (M.A.P.)\nestimates of the true signal in the presence of noisy data and provide\nalgorithms for computing the M.A.P. Finally, we demonstrate the algorithms'\nability to effectively restore white noise on image data, and from severe\ndropout in toy \\& EHR data.\n","authors":["Sam Leone","Xingzhi Sun","Michael Perlmutter","Smita Krishnaswamy"],"pdf_url":"https://arxiv.org/pdf/2311.16378v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.05760v3","updated":"2023-11-27T23:38:19Z","published":"2022-01-15T05:25:03Z","title":"Big Data Analytics for Network Level Short-Term Travel Time Prediction\n with Hierarchical LSTM","summary":" The travel time data collected from widespread traffic monitoring sensors\nnecessitate big data analytic tools for querying, visualization, and\nidentifying meaningful traffic patterns. This paper utilizes a large-scale\ntravel time dataset from Caltrans Performance Measurement System (PeMS) system\nthat is an overflow for traditional data processing and modeling tools. To\novercome the challenges of the massive amount of data, the big data analytic\nengines Apache Spark and Apache MXNet are applied for data wrangling and\nmodeling. Seasonality and autocorrelation were performed to explore and\nvisualize the trend of time-varying data. Inspired by the success of the\nhierarchical architecture for many Artificial Intelligent (AI) tasks, we\nconsolidate the cell and hidden states passed from low-level to the high-level\nLSTM with an attention pooling similar to how the human perception system\noperates. The designed hierarchical LSTM model can consider the dependencies at\ndifferent time scales to capture the spatial-temporal correlations of\nnetwork-level travel time. Another self-attention module is then devised to\nconnect LSTM extracted features to the fully connected layers, predicting\ntravel time for all corridors instead of a single link/route. 
The comparison\nresults show that the Hierarchical LSTM with Attention (HierLSTMat) model gives\nthe best prediction results at 30-minute and 45-min horizons and can\nsuccessfully forecast unusual congestion. The efficiency gained from big data\nanalytic tools was evaluated by comparing them with popular data science and\ndeep learning frameworks.\n","authors":["Tianya T. Zhang"],"pdf_url":"https://arxiv.org/pdf/2201.05760v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16374v1","updated":"2023-11-27T23:35:40Z","published":"2023-11-27T23:35:40Z","title":"Physics-Informed Neural Network for Discovering Systems with\n Unmeasurable States with Application to Lithium-Ion Batteries","summary":" Combining machine learning with physics is a trending approach for\ndiscovering unknown dynamics, and one of the most intensively studied\nframeworks is the physics-informed neural network (PINN). However, PINN often\nfails to optimize the network due to its difficulty in concurrently minimizing\nmultiple losses originating from the system's governing equations. This problem\ncan be more serious when the system's states are unmeasurable, like lithium-ion\nbatteries (LiBs). In this work, we introduce a robust method for training PINN\nthat uses fewer loss terms and thus constructs a less complex landscape for\noptimization. In particular, instead of having loss terms from each\ndifferential equation, this method embeds the dynamics into a loss function\nthat quantifies the error between observed and predicted system outputs. This\nis accomplished by numerically integrating the predicted states from the neural\nnetwork(NN) using known dynamics and transforming them to obtain a sequence of\npredicted outputs. Minimizing such a loss optimizes the NN to predict states\nconsistent with observations given the physics. Further, the system's\nparameters can be added to the optimization targets. To demonstrate the ability\nof this method to perform various modeling and control tasks, we apply it to a\nbattery model to concurrently estimate its states and parameters.\n","authors":["Yuichi Kajiura","Jorge Espin","Dong Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.16374v1.pdf","comment":"7 pages, 4 figure, submitted to American Control Conference 2024"},{"id":"http://arxiv.org/abs/2111.10085v4","updated":"2023-11-27T23:25:00Z","published":"2021-11-19T08:02:38Z","title":"Mate! Are You Really Aware? An Explainability-Guided Testing Framework\n for Robustness of Malware Detectors","summary":" Numerous open-source and commercial malware detectors are available. However,\ntheir efficacy is threatened by new adversarial attacks, whereby malware\nattempts to evade detection, e.g., by performing feature-space manipulation. In\nthis work, we propose an explainability-guided and model-agnostic testing\nframework for robustness of malware detectors when confronted with adversarial\nattacks. The framework introduces the concept of Accrued Malicious Magnitude\n(AMM) to identify which malware features could be manipulated to maximize the\nlikelihood of evading detection. We then use this framework to test several\nstate-of-the-art malware detectors' abilities to detect manipulated malware. 
We\nfind that (i) commercial antivirus engines are vulnerable to AMM-guided test\ncases; (ii) the ability of a manipulated malware generated using one detector\nto evade detection by another detector (i.e., transferability) depends on the\noverlap of features with large AMM values between the different detectors; and\n(iii) AMM values effectively measure the fragility of features (i.e.,\ncapability of feature-space manipulation to flip the prediction results) and\nexplain the robustness of malware detectors facing evasion attacks. Our\nfindings shed light on the limitations of current malware detectors, as well as\nhow they can be improved.\n","authors":["Ruoxi Sun","Minhui Xue","Gareth Tyson","Tian Dong","Shaofeng Li","Shuo Wang","Haojin Zhu","Seyit Camtepe","Surya Nepal"],"pdf_url":"https://arxiv.org/pdf/2111.10085v4.pdf","comment":"Accepted at ESEC/FSE 2023. https://doi.org/10.1145/3611643.3616309"},{"id":"http://arxiv.org/abs/2307.04870v4","updated":"2023-11-27T23:21:33Z","published":"2023-07-10T19:34:41Z","title":"RACH-Space: Reconstructing Adaptive Convex Hull Space with applications\n in weak supervision","summary":" We introduce RACH-Space, a novel classification method in ensemble learning.\nIn particular, we show its applicability as a label model for weakly supervised\nlearning. RACH-Space offers simplicity in implementation with minimal\nassumptions on the data or weak signals. The model is well suited for scenarios\nwhere fully labeled data is not available. Our method is built upon geometrical\ninterpretation of the space spanned by weak signals. Our analysis of the high\ndimensional convex hull structure underlying general set of weak signals\nbridges geometry with machine learning. Empirical results also demonstrate that\nRACH-Space works well in practice and compares favorably to best existing label\nmodels for weakly supervised learning.\n","authors":["Woojoo Na"],"pdf_url":"https://arxiv.org/pdf/2307.04870v4.pdf","comment":"11 pages"}],"Multimedia":[{"id":"http://arxiv.org/abs/2304.02970v4","updated":"2023-11-27T13:11:20Z","published":"2023-04-06T09:54:06Z","title":"A Closer Look at Audio-Visual Segmentation","summary":" Audio-visual segmentation (AVS) is a complex task that involves accurately\nsegmenting the corresponding sounding object based on audio-visual queries.\nSuccessful audio-visual learning requires two essential components: 1) an\nunbiased dataset with high-quality pixel-level multi-class labels, and 2) a\nmodel capable of effectively linking audio information with its corresponding\nvisual object. However, these two requirements are only partially addressed by\ncurrent methods, with training sets containing biased audio-visual data, and\nmodels that generalise poorly beyond this biased training set. In this work, we\npropose a new strategy to build cost-effective and relatively unbiased\naudio-visual semantic segmentation benchmarks. Our strategy, called Visual\nPost-production (VPO), explores the observation that it is not necessary to\nhave explicit audio-visual pairs extracted from single video sources to build\nsuch benchmarks. We also refine the previously proposed AVSBench to transform\nit into the audio-visual semantic segmentation benchmark AVSBench-Single+.\nFurthermore, this paper introduces a new pixel-wise audio-visual contrastive\nlearning method to enable a better generalisation of the model beyond the\ntraining set. 
We verify the validity of the VPO strategy by showing that\nstate-of-the-art (SOTA) models trained with datasets built by matching audio\nand visual data from different sources or with datasets containing audio and\nvisual data from the same video source produce almost the same accuracy. Then,\nusing the proposed VPO benchmarks and AVSBench-Single+, we show that our method\nproduces more accurate audio-visual semantic segmentation than SOTA models.\nCode and dataset will be available.\n","authors":["Yuanhong Chen","Yuyuan Liu","Hu Wang","Fengbei Liu","Chong Wang","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2304.02970v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12454v2","updated":"2023-11-27T12:26:32Z","published":"2023-11-21T09:07:11Z","title":"HierSpeech++: Bridging the Gap between Semantic and Acoustic\n Representation of Speech by Hierarchical Variational Inference for Zero-shot\n Speech Synthesis","summary":" Large language models (LLM)-based speech synthesis has been widely adopted in\nzero-shot speech synthesis. However, they require a large-scale data and\npossess the same limitations as previous autoregressive speech models,\nincluding slow inference speed and lack of robustness. This paper proposes\nHierSpeech++, a fast and strong zero-shot speech synthesizer for text-to-speech\n(TTS) and voice conversion (VC). We verified that hierarchical speech synthesis\nframeworks could significantly improve the robustness and expressiveness of the\nsynthetic speech. Furthermore, we significantly improve the naturalness and\nspeaker similarity of synthetic speech even in zero-shot speech synthesis\nscenarios. For text-to-speech, we adopt the text-to-vec framework, which\ngenerates a self-supervised speech representation and an F0 representation\nbased on text representations and prosody prompts. Then, HierSpeech++ generates\nspeech from the generated vector, F0, and voice prompt. We further introduce a\nhigh-efficient speech super-resolution framework from 16 kHz to 48 kHz. The\nexperimental results demonstrated that the hierarchical variational autoencoder\ncould be a strong zero-shot speech synthesizer given that it outperforms\nLLM-based and diffusion-based models. Moreover, we achieved the first\nhuman-level quality zero-shot speech synthesis. Audio samples and source code\nare available at https://github.com/sh-lee-prml/HierSpeechpp.\n","authors":["Sang-Hoon Lee","Ha-Yeong Choi","Seung-Bin Kim","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2311.12454v2.pdf","comment":"16 pages, 9 figures, 12 tables"},{"id":"http://arxiv.org/abs/2309.07983v2","updated":"2023-11-27T11:54:56Z","published":"2023-09-14T18:40:28Z","title":"SLMIA-SR: Speaker-Level Membership Inference Attacks against Speaker\n Recognition Systems","summary":" Membership inference attacks allow adversaries to determine whether a\nparticular example was contained in the model's training dataset. While\nprevious works have confirmed the feasibility of such attacks in various\napplications, none has focused on speaker recognition (SR), a promising\nvoice-based biometric recognition technique. In this work, we propose SLMIA-SR,\nthe first membership inference attack tailored to SR. In contrast to\nconventional example-level attack, our attack features speaker-level membership\ninference, i.e., determining if any voices of a given speaker, either the same\nas or different from the given inference voices, have been involved in the\ntraining of a model. 
It is particularly useful and practical since the training\nand inference voices are usually distinct, and it is also meaningful\nconsidering the open-set nature of SR, namely, the recognition speakers were\noften not present in the training data. We utilize intra-similarity and\ninter-dissimilarity, two training objectives of SR, to characterize the\ndifferences between training and non-training speakers and quantify them with\ntwo groups of features driven by carefully-established feature engineering to\nmount the attack. To improve the generalizability of our attack, we propose a\nnovel mixing ratio training strategy to train attack models. To enhance the\nattack performance, we introduce voice chunk splitting to cope with the limited\nnumber of inference voices and propose to train attack models dependent on the\nnumber of inference voices. Our attack is versatile and can work in both\nwhite-box and black-box scenarios. Additionally, we propose two novel\ntechniques to reduce the number of black-box queries while maintaining the\nattack performance. Extensive experiments demonstrate the effectiveness of\nSLMIA-SR.\n","authors":["Guangke Chen","Yedi Zhang","Fu Song"],"pdf_url":"https://arxiv.org/pdf/2309.07983v2.pdf","comment":"In Proceedings of the 31st Network and Distributed System Security\n (NDSS) Symposium, 2024"},{"id":"http://arxiv.org/abs/2311.13770v2","updated":"2023-11-27T11:09:47Z","published":"2023-11-23T01:53:02Z","title":"Archiving Body Movements: Collective Generation of Chinese Calligraphy","summary":" As a communication channel, body movements have been widely explored in\nbehavioral studies and kinesics. Performing and visual arts share the same\ninterests but focus on documenting and representing human body movements, such\nas for dance notation and visual work creation. This paper investigates body\nmovements in oriental calligraphy and how to apply calligraphy principles to\nstimulate and archive body movements. Through an artwork (Wushu), the authors\nexperiment with an interactive and generative approach to engage the audience's\nbodily participation and archive the body movements as a compendium of\ngenerated calligraphy. The audience assumes the role of both writers and\nreaders; creating (\"writing\") and appreciating (\"reading\") the generated\ncalligraphy becomes a cyclical process within this infinite \"Book,\" which can\nmotivate further attention and discussions concerning Chinese characters and\ncalligraphy.\n","authors":["Aven Le Zhou","Jiayi Ye","Tianchen Liu","Kang Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.13770v2.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2311.15581v1","updated":"2023-11-27T07:19:10Z","published":"2023-11-27T07:19:10Z","title":"Real Time GAZED: Online Shot Selection and Editing of Virtual Cameras\n from Wide-Angle Monocular Video Recordings","summary":" Eliminating time-consuming post-production processes and delivering\nhigh-quality videos in today's fast-paced digital landscape are the key\nadvantages of real-time approaches. To address these needs, we present Real\nTime GAZED: a real-time adaptation of the GAZED framework integrated with\nCineFilter, a novel real-time camera trajectory stabilization approach. It\nenables users to create professionally edited videos in real-time. Comparative\nevaluations against baseline methods, including the non-real-time GAZED,\ndemonstrate that Real Time GAZED achieves similar editing results, ensuring\nhigh-quality video output. 
Furthermore, a user study confirms the aesthetic\nquality of the video edits produced by the Real Time GAZED approach. With these\nadvancements in real-time camera trajectory optimization and video editing\npresented, the demand for immediate and dynamic content creation in industries\nsuch as live broadcasting, sports coverage, news reporting, and social media\ncontent creation can be met more efficiently.\n","authors":["Sudheer Achary","Rohit Girmaji","Adhiraj Anil Deshmukh","Vineet Gandhi"],"pdf_url":"https://arxiv.org/pdf/2311.15581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15540v1","updated":"2023-11-27T05:10:15Z","published":"2023-11-27T05:10:15Z","title":"EAFP-Med: An Efficient Adaptive Feature Processing Module Based on\n Prompts for Medical Image Detection","summary":" In the face of rapid advances in medical imaging, cross-domain adaptive\nmedical image detection is challenging due to the differences in lesion\nrepresentations across various medical imaging technologies. To address this\nissue, we draw inspiration from large language models to propose EAFP-Med, an\nefficient adaptive feature processing module based on prompts for medical image\ndetection. EAFP-Med can efficiently extract lesion features of different scales\nfrom a diverse range of medical images based on prompts while being flexible\nand not limited by specific imaging techniques. Furthermore, it serves as a\nfeature preprocessing module that can be connected to any model front-end to\nenhance the lesion features in input images. Moreover, we propose a novel\nadaptive disease detection model named EAFP-Med ST, which utilizes the Swin\nTransformer V2 - Tiny (SwinV2-T) as its backbone and connects it to EAFP-Med.\nWe have compared our method to nine state-of-the-art methods. Experimental\nresults demonstrate that EAFP-Med ST achieves the best performance on all three\ndatasets (chest X-ray images, cranial magnetic resonance imaging images, and\nskin images). EAFP-Med can efficiently extract lesion features from various\nmedical images based on prompts, enhancing the model's performance. This holds\nsignificant potential for improving medical image analysis and diagnosis.\n","authors":["Xiang Li","Long Lan","Husam Lahza","Shaowu Yang","Shuihua Wang","Wenjing Yang","Hengzhu Liu","Yudong Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15480v1","updated":"2023-11-27T01:44:02Z","published":"2023-11-27T01:44:02Z","title":"Automatic Time Signature Determination for New Scores Using Lyrics for\n Latent Rhythmic Structure","summary":" There has recently been a sharp increase in interest in Artificial\nIntelligence-Generated Content (AIGC). Despite this, musical components such as\ntime signatures have not been studied sufficiently to form an algorithmic\ndetermination approach for new compositions, especially lyrical songs. This is\nlikely because of the neglect of musical details, which is critical for\nconstructing a robust framework. Specifically, time signatures establish the\nfundamental rhythmic structure for almost all aspects of a song, including the\nphrases and notes. In this paper, we propose a novel approach that only uses\nlyrics as input to automatically generate a fitting time signature for lyrical\nsongs and uncover the latent rhythmic structure utilizing explainable machine\nlearning models. 
In particular, we devise multiple methods that are associated\nwith discovering lyrical patterns and creating new features that simultaneously\ncontain lyrical, rhythmic, and statistical information. In this approach, the\nbest of our experimental results reveal a 97.6% F1 score and a 0.996 Area Under\nthe Curve (AUC) of the Receiver Operating Characteristic (ROC) score. In\nconclusion, our research directly generates time signatures from lyrics\nautomatically for new scores utilizing machine learning, which is an innovative\nidea that approaches an understudied component of musicology and therefore\ncontributes significantly to the future of Artificial Intelligence (AI) music\ngeneration.\n","authors":["Callie C. Liao","Duoduo Liao","Jesse Guessford"],"pdf_url":"https://arxiv.org/pdf/2311.15480v1.pdf","comment":"Submitted to IEEE Big Data 2023 Conference"},{"id":"http://arxiv.org/abs/2311.16254v1","updated":"2023-11-27T19:02:17Z","published":"2023-11-27T19:02:17Z","title":"Removing NSFW Concepts from Vision-and-Language Models for Text-to-Image\n Retrieval and Generation","summary":" Vision-and-Language models such as CLIP have demonstrated remarkable\neffectiveness across a wide range of tasks. However, these models are typically\ntrained on web-scale data, which can introduce inappropriate content and lead\nto the development of unsafe and biased behavior. This, in turn, hampers their\napplicability in sensitive and trustworthy contexts and could raise significant\nconcern in their adoption. To overcome these limitations, we introduce a\nmethodology to make Vision-and-Language models safer by removing their\nsensitivity to not-safe-for-work concepts. We show how this can be done by\ndistilling from a large language model which converts between safe and unsafe\nsentences and which is fine-tuned starting from just 100 manually-curated\npairs. We conduct extensive experiments on the resulting embedding space for\nboth retrieval and text-to-image generation, where we show that our model can\nalso be properly employed with pre-trained image generators. Our source code\nand trained models are available at: https://github.com/aimagelab/safe-clip.\n","authors":["Samuele Poppi","Tobia Poppi","Federico Cocchi","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2311.16254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17072v1","updated":"2023-11-27T19:00:06Z","published":"2023-11-27T19:00:06Z","title":"IG Captioner: Information Gain Captioners are Strong Zero-shot\n Classifiers","summary":" Generative training has been demonstrated to be powerful for building\nvisual-language models. However, on zero-shot discriminative benchmarks, there\nis still a performance gap between models trained with generative and\ndiscriminative objectives. In this paper, we aim to narrow this gap by\nimproving the efficacy of generative training on classification tasks, without\nany finetuning processes or additional modules.\n Specifically, we focus on narrowing the gap between the generative captioner\nand the CLIP classifier. We begin by analysing the predictions made by the\ncaptioner and classifier and observe that the caption generation inherits the\ndistribution bias from the language model trained with pure text modality,\nmaking it less grounded on the visual signal. To tackle this problem, we\nredesign the scoring objective for the captioner to alleviate the\ndistributional bias and focus on measuring the gain of information brought by\nthe visual inputs. 
We further design a generative training objective to match\nthe evaluation objective. We name our model trained and evaluated from the\nnovel procedures as Information Gain (IG) captioner. We pretrain the models on\nthe public Laion-5B dataset and perform a series of discriminative evaluations.\nFor the zero-shot classification on ImageNet, IG captioner achieves $> 18\\%$\nimprovements over the standard captioner, achieving comparable performances\nwith the CLIP classifier. IG captioner also demonstrated strong performance on\nzero-shot image-text retrieval tasks on MSCOCO and Flickr30K. We hope this\npaper inspires further research towards unifying generative and discriminative\ntraining procedures for visual-language models.\n","authors":["Chenglin Yang","Siyuan Qiao","Yuan Cao","Yu Zhang","Tao Zhu","Alan Yuille","Jiahui Yu"],"pdf_url":"https://arxiv.org/pdf/2311.17072v1.pdf","comment":null}]},"2023-11-26T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2311.15451v1","updated":"2023-11-26T22:47:54Z","published":"2023-11-26T22:47:54Z","title":"Uncertainty-aware Language Modeling for Selective Question Answering","summary":" We present an automatic large language model (LLM) conversion approach that\nproduces uncertainty-aware LLMs capable of estimating uncertainty with every\nprediction. Our approach is model- and data-agnostic, is\ncomputationally-efficient, and does not rely on external models or systems. We\nevaluate converted models on the selective question answering setting -- to\nanswer as many questions as possible while maintaining a given accuracy,\nforgoing providing predictions when necessary. As part of our results, we test\nBERT and Llama 2 model variants on the SQuAD extractive QA task and the\nTruthfulQA generative QA task. We show that using the uncertainty estimates\nprovided by our approach to selectively answer questions leads to significantly\nhigher accuracy over directly using model probabilities.\n","authors":["Qi Yang","Shreya Ravikumar","Fynn Schmitt-Ulms","Satvik Lolla","Ege Demir","Iaroslav Elistratov","Alex Lavaee","Sadhana Lolla","Elaheh Ahmadi","Daniela Rus","Alexander Amini","Alejandro Perez"],"pdf_url":"https://arxiv.org/pdf/2311.15451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04947v2","updated":"2023-11-26T22:00:36Z","published":"2023-04-11T03:17:37Z","title":"Conditional Adapters: Parameter-efficient Transfer Learning with Fast\n Inference","summary":" We propose Conditional Adapter (CoDA), a parameter-efficient transfer\nlearning method that also improves inference efficiency. CoDA generalizes\nbeyond standard adapter approaches to enable a new way of balancing speed and\naccuracy using conditional computation. Starting with an existing dense\npretrained model, CoDA adds sparse activation together with a small number of\nnew parameters and a light-weight training phase. Our experiments demonstrate\nthat the CoDA approach provides an unexpectedly efficient way to transfer\nknowledge. Across a variety of language, vision, and speech tasks, CoDA\nachieves a 2x to 8x inference speed-up compared to the state-of-the-art Adapter\napproaches with moderate to no accuracy loss and the same parameter efficiency.\n","authors":["Tao Lei","Junwen Bai","Siddhartha Brahma","Joshua Ainslie","Kenton Lee","Yanqi Zhou","Nan Du","Vincent Y. 
Zhao","Yuexin Wu","Bo Li","Yu Zhang","Ming-Wei Chang"],"pdf_url":"https://arxiv.org/pdf/2304.04947v2.pdf","comment":"NeurIPS camera ready version"},{"id":"http://arxiv.org/abs/2311.15436v1","updated":"2023-11-26T21:45:53Z","published":"2023-11-26T21:45:53Z","title":"Learning to Skip for Language Modeling","summary":" Overparameterized large-scale language models have impressive generalization\nperformance of in-context few-shot learning. However, most language models\nallocate the same amount of parameters or computation to each token,\ndisregarding the complexity or importance of the input data. We argue that in\nlanguage model pretraining, a variable amount of computation should be assigned\nto different tokens, and this can be efficiently achieved via a simple routing\nmechanism. Different from conventional early stopping techniques where tokens\ncan early exit at only early layers, we propose a more general method that\ndynamically skips the execution of a layer (or module) for any input token with\na binary router. In our extensive evaluation across 24 NLP tasks, we\ndemonstrate that the proposed method can significantly improve the 1-shot\nperformance compared to other competitive baselines only at mild extra cost for\ninference.\n","authors":["Dewen Zeng","Nan Du","Tao Wang","Yuanzhong Xu","Tao Lei","Zhifeng Chen","Claire Cui"],"pdf_url":"https://arxiv.org/pdf/2311.15436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03081v2","updated":"2023-11-26T21:40:00Z","published":"2023-06-05T17:55:05Z","title":"Sequential Monte Carlo Steering of Large Language Models using\n Probabilistic Programs","summary":" Even after fine-tuning and reinforcement learning, large language models\n(LLMs) can be difficult, if not impossible, to control reliably with prompts\nalone. We propose a new inference-time approach to enforcing syntactic and\nsemantic constraints on the outputs of LLMs, called sequential Monte Carlo\n(SMC) steering. The key idea is to specify language generation tasks as\nposterior inference problems in a class of discrete probabilistic sequence\nmodels, and replace standard decoding with sequential Monte Carlo inference.\nFor a computational cost similar to that of beam search, SMC can steer LLMs to\nsolve diverse tasks, including infilling, generation under syntactic\nconstraints, and prompt intersection. To facilitate experimentation with SMC\nsteering, we present a probabilistic programming library, LLaMPPL\n(https://github.com/probcomp/hfppl), for concisely specifying new generation\ntasks as language model probabilistic programs, and automating steering of\nLLaMA-family Transformers.\n","authors":["Alexander K. Lew","Tan Zhi-Xuan","Gabriel Grand","Vikash K. Mansinghka"],"pdf_url":"https://arxiv.org/pdf/2306.03081v2.pdf","comment":"Minor typo fixes"},{"id":"http://arxiv.org/abs/2311.15425v1","updated":"2023-11-26T21:16:01Z","published":"2023-11-26T21:16:01Z","title":"Machine-Generated Text Detection using Deep Learning","summary":" Our research focuses on the crucial challenge of discerning text produced by\nLarge Language Models (LLMs) from human-generated text, which holds\nsignificance for various applications. With ongoing discussions about attaining\na model with such functionality, we present supporting evidence regarding the\nfeasibility of such models. We evaluated our models on multiple datasets,\nincluding Twitter Sentiment, Football Commentary, Project Gutenberg, PubMedQA,\nand SQuAD, confirming the efficacy of the enhanced detection approaches. 
These\ndatasets were sampled with intricate constraints encompassing every\npossibility, laying the foundation for future research. We evaluate\nGPT-3.5-Turbo against various detectors such as SVM, RoBERTa-base, and\nRoBERTa-large. The research findings indicate that detection performance depended\npredominantly on the sequence length of the sentence.\n","authors":["Raghav Gaggar","Ashish Bhagchandani","Harsh Oza"],"pdf_url":"https://arxiv.org/pdf/2311.15425v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.15402v1","updated":"2023-11-26T19:56:19Z","published":"2023-11-26T19:56:19Z","title":"Learning Section Weights for Multi-Label Document Classification","summary":" Multi-label document classification is a traditional task in NLP. Compared to\nsingle-label classification, each document can be assigned multiple classes.\nThis problem is crucially important in various domains, such as tagging\nscientific articles. Documents are often structured into several sections such\nas abstract and title. Current approaches treat different sections equally for\nmulti-label classification. We argue that this is not a realistic assumption,\nleading to sub-optimal results. Instead, we propose a new method called\nLearning Section Weights (LSW), leveraging the contribution of each distinct\nsection for multi-label classification. Via multiple feed-forward layers, LSW\nlearns to assign weights to each section, and incorporates the weights in the\nprediction. We demonstrate our approach on scientific articles. Experimental\nresults on public (arXiv) and private (Elsevier) datasets confirm the\nsuperiority of LSW, compared to state-of-the-art multi-label document\nclassification methods. In particular, LSW achieves a 1.3% improvement in terms\nof macro averaged F1-score while it achieves 1.3% in terms of macro averaged\nrecall on the publicly available arXiv dataset.\n","authors":["Maziar Moradi Fard","Paula Sorrolla Bayod","Kiomars Motarjem","Mohammad Alian Nejadi","Saber Akhondi","Camilo Thorne"],"pdf_url":"https://arxiv.org/pdf/2311.15402v1.pdf","comment":"7 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2305.14930v2","updated":"2023-11-26T18:36:30Z","published":"2023-05-24T09:13:15Z","title":"In-Context Impersonation Reveals Large Language Models' Strengths and\n Biases","summary":" In everyday conversations, humans can take on different roles and adapt their\nvocabulary to their chosen roles. We explore whether LLMs can take on, that is\nimpersonate, different roles when they generate text in-context. We ask LLMs to\nassume different personas before solving vision and language tasks. We do this\nby prefixing the prompt with a persona that is associated either with a social\nidentity or domain expertise. In a multi-armed bandit task, we find that LLMs\npretending to be children of different ages recover human-like developmental\nstages of exploration. In a language-based reasoning task, we find that LLMs\nimpersonating domain experts perform better than LLMs impersonating non-domain\nexperts. Finally, we test whether LLMs' impersonations are complementary to\nvisual information when describing different categories. We find that\nimpersonation can improve performance: an LLM prompted to be a bird expert\ndescribes birds better than one prompted to be a car expert. However,\nimpersonation can also uncover LLMs' biases: an LLM prompted to be a man\ndescribes cars better than one prompted to be a woman. 
These findings\ndemonstrate that LLMs are capable of taking on diverse roles and that this\nin-context impersonation can be used to uncover their hidden strengths and\nbiases.\n","authors":["Leonard Salewski","Stephan Alaniz","Isabel Rio-Torto","Eric Schulz","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2305.14930v2.pdf","comment":"Published in NeurIPS 2023 (Spotlight)"},{"id":"http://arxiv.org/abs/2212.06373v7","updated":"2023-11-26T17:24:19Z","published":"2022-12-13T05:12:40Z","title":"InferEM: Inferring the Speaker's Intention for Empathetic Dialogue\n Generation","summary":" Current approaches to empathetic response generation typically encode the\nentire dialogue history directly and put the output into a decoder to generate\nfriendly feedback. These methods focus on modelling contextual information but\nneglect capturing the direct intention of the speaker. We argue that the last\nutterance in the dialogue empirically conveys the intention of the speaker.\nConsequently, we propose a novel model named InferEM for empathetic response\ngeneration. We separately encode the last utterance and fuse it with the entire\ndialogue through the multi-head attention based intention fusion module to\ncapture the speaker's intention. Besides, we utilize previous utterances to\npredict the last utterance, which simulates human's psychology to guess what\nthe interlocutor may speak in advance. To balance the optimizing rates of the\nutterance prediction and response generation, a multi-task learning strategy is\ndesigned for InferEM. Experimental results demonstrate the plausibility and\nvalidity of InferEM in improving empathetic expression.\n","authors":["Guoqing Lv","Jiang Li","Xiaoping Wang","Zhigang Zeng"],"pdf_url":"https://arxiv.org/pdf/2212.06373v7.pdf","comment":"Accepted by the 45th Annual Meeting of the Cognitive Science Society\n (CogSci 2023)"},{"id":"http://arxiv.org/abs/2311.05419v2","updated":"2023-11-26T17:12:20Z","published":"2023-11-09T14:58:46Z","title":"Mirror: A Universal Framework for Various Information Extraction Tasks","summary":" Sharing knowledge between information extraction tasks has always been a\nchallenge due to the diverse data formats and task variations. Meanwhile, this\ndivergence leads to information waste and increases difficulties in building\ncomplex applications in real scenarios. Recent studies often formulate IE tasks\nas a triplet extraction problem. However, such a paradigm does not support\nmulti-span and n-ary extraction, leading to weak versatility. To this end, we\nreorganize IE problems into unified multi-slot tuples and propose a universal\nframework for various IE tasks, namely Mirror. Specifically, we recast existing\nIE tasks as a multi-span cyclic graph extraction problem and devise a\nnon-autoregressive graph decoding algorithm to extract all spans in a single\nstep. It is worth noting that this graph structure is incredibly versatile, and\nit supports not only complex IE tasks, but also machine reading comprehension\nand classification tasks. We manually construct a corpus containing 57 datasets\nfor model pretraining, and conduct experiments on 30 datasets across 8\ndownstream tasks. The experimental results demonstrate that our model has\ndecent compatibility and outperforms or reaches competitive performance with\nSOTA systems under few-shot and zero-shot settings. 
The code, model weights,\nand pretraining corpus are available at https://github.com/Spico197/Mirror .\n","authors":["Tong Zhu","Junfei Ren","Zijian Yu","Mengsong Wu","Guoliang Zhang","Xiaoye Qu","Wenliang Chen","Zhefeng Wang","Baoxing Huai","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.05419v2.pdf","comment":"Accepted to EMNLP23 main conference"},{"id":"http://arxiv.org/abs/2305.09620v2","updated":"2023-11-26T16:25:49Z","published":"2023-05-16T17:13:07Z","title":"AI-Augmented Surveys: Leveraging Large Language Models and Surveys for\n Opinion Prediction","summary":" Large language models (LLMs) that produce human-like responses have begun to\nrevolutionize research practices in the social sciences. This paper shows how\nwe can integrate LLMs and social surveys to accurately predict individual\nresponses to survey questions that were not asked before. We develop a novel\nmethodological framework to personalize LLMs by considering the meaning of\nsurvey questions derived from their text, the latent beliefs of individuals\ninferred from their response patterns, and the temporal contexts across\ndifferent survey periods through fine-tuning LLMs with survey data. Using the\nGeneral Social Survey from 1972 to 2021, we show that the fine-tuned model\nbased on Alpaca-7b can predict individual responses to survey questions that\nare partially missing as well as entirely missing. The remarkable prediction\ncapabilities allow us to fill in missing trends with high confidence and\npinpoint when public attitudes changed, such as the rising support for same-sex\nmarriage. We discuss practical constraints, socio-demographic representation,\nand ethical concerns regarding individual autonomy and privacy when using LLMs\nfor opinion prediction. This study demonstrates that LLMs and surveys can\nmutually enhance each other's capabilities: LLMs broaden survey potential,\nwhile surveys improve the alignment of LLMs.\n","authors":["Junsol Kim","Byungkyu Lee"],"pdf_url":"https://arxiv.org/pdf/2305.09620v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11435v2","updated":"2023-11-26T15:05:58Z","published":"2023-11-19T22:14:48Z","title":"Unveiling Public Perceptions: Machine Learning-Based Sentiment Analysis\n of COVID-19 Vaccines in India","summary":" In March 2020, the World Health Organisation declared COVID-19 a global\npandemic as it spread to nearly every country. By mid-2021, India had\nintroduced three vaccines: Covishield, Covaxin, and Sputnik. To ensure\nsuccessful vaccination in a densely populated country like India, understanding\npublic sentiment was crucial. Social media, particularly Reddit with over 430\nmillion users, played a vital role in disseminating information. This study\nemploys data mining techniques to analyze Reddit data and gauge Indian\nsentiments towards COVID-19 vaccines. Using Python's Text Blob library,\ncomments are annotated to assess general sentiments. 
Results show that most\nReddit users in India expressed neutrality about vaccination, posing a\nchallenge for the Indian government's efforts to vaccinate a significant\nportion of the population.\n","authors":["Milind Gupta","Abhishek Kaushik"],"pdf_url":"https://arxiv.org/pdf/2311.11435v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15316v1","updated":"2023-11-26T14:35:23Z","published":"2023-11-26T14:35:23Z","title":"Enhancing Empathetic and Emotion Support Dialogue Generation with\n Prophetic Commonsense Inference","summary":" The interest in Empathetic and Emotional Support conversations among the\npublic has significantly increased. To offer more sensitive and understanding\nresponses, leveraging commonsense knowledge has become a common strategy to\nbetter understand psychological aspects and causality. However, such\ncommonsense inferences can be out of context and unable to predict upcoming\ndialogue themes, resulting in responses that lack coherence and empathy. To\nremedy this issue, we present Prophetic Commonsense Inference, an innovative\nparadigm for inferring commonsense knowledge. By harnessing the capabilities of\nLarge Language Models in understanding dialogue and making commonsense\ndeductions, we train tunable models to bridge the gap between past and\npotential future dialogues. Extensive experiments conducted on\nEmpatheticDialogues and Emotion Support Conversation show that equipping\ndialogue agents with our proposed prophetic commonsense inference significantly\nenhances the quality of their responses.\n","authors":["Lanrui Wang","Jiangnan Li","Chenxu Yang","Zheng Lin","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15296v1","updated":"2023-11-26T13:42:56Z","published":"2023-11-26T13:42:56Z","title":"UHGEval: Benchmarking the Hallucination of Chinese Large Language Models\n via Unconstrained Generation","summary":" Large language models (LLMs) have emerged as pivotal contributors in\ncontemporary natural language processing and are increasingly being applied\nacross a diverse range of industries. However, these large-scale probabilistic\nstatistical models cannot currently ensure the requisite quality in\nprofessional content generation. These models often produce hallucinated text,\ncompromising their practical utility in professional contexts. To assess the\nauthentic reliability of LLMs in text generation, numerous initiatives have\ndeveloped benchmark evaluations for hallucination phenomena. Nevertheless,\nthese benchmarks frequently utilize constrained generation techniques due to\ncost and temporal constraints. These techniques encompass the use of directed\nhallucination induction and strategies that deliberately alter authentic text\nto produce hallucinations. These approaches are not congruent with the\nunrestricted text generation demanded by real-world applications. Furthermore,\na well-established Chinese-language dataset dedicated to the evaluation of\nhallucinations in text generation is presently lacking. Consequently, we have\ndeveloped an Unconstrained Hallucination Generation Evaluation (UHGEval)\nbenchmark, designed to compile outputs produced with minimal restrictions by\nLLMs. Concurrently, we have established a comprehensive benchmark evaluation\nframework to aid subsequent researchers in undertaking scalable and\nreproducible experiments. 
We have also executed extensive experiments,\nevaluating prominent Chinese language models and the GPT series models to\nderive professional performance insights regarding hallucination challenges.\n","authors":["Xun Liang","Shichao Song","Simin Niu","Zhiyu Li","Feiyu Xiong","Bo Tang","Zhaohui Wy","Dawei He","Peng Cheng","Zhonghao Wang","Haiying Deng"],"pdf_url":"https://arxiv.org/pdf/2311.15296v1.pdf","comment":"13 Pages, submitted to ICDE2024"},{"id":"http://arxiv.org/abs/2307.11845v2","updated":"2023-11-26T08:57:44Z","published":"2023-07-21T18:29:04Z","title":"Multimodal Document Analytics for Banking Process Automation","summary":" Traditional banks face increasing competition from FinTechs in the rapidly\nevolving financial ecosystem. Raising operational efficiency is vital to\naddress this challenge. Our study aims to improve the efficiency of\ndocument-intensive business processes in banking. To that end, we first review\nthe landscape of business documents in the retail segment. Banking documents\noften contain text, layout, and visuals, suggesting that document analytics and\nprocess automation require more than plain natural language processing (NLP).\nTo verify this and assess the incremental value of visual cues when processing\nbusiness documents, we compare a recently proposed multimodal model called\nLayoutXLM to powerful text classifiers (e.g., BERT) and large language models\n(e.g., GPT) in a case study related to processing company register extracts.\nThe results confirm that incorporating layout information in a model\nsubstantially increases its performance. Interestingly, we also observed that\nmore than 75% of the best model performance (in terms of the F1 score) can be\nachieved with as little as 30% of the training data. This shows that the demand\nfor labeled data to set up a multi-modal model can be moderate, which\nsimplifies real-world applications of multimodal document analytics. Our study\nalso sheds light on more specific practices in the scope of calibrating a\nmultimodal banking document classifier, including the need for fine-tuning. In\nsum, the paper contributes original empirical evidence on the effectiveness and\nefficiency of multimodal models for document processing in the banking\nbusiness and offers practical guidance on how to unlock this potential in\nday-to-day operations.\n","authors":["Christopher Gerling","Stefan Lessmann"],"pdf_url":"https://arxiv.org/pdf/2307.11845v2.pdf","comment":"A Preprint"},{"id":"http://arxiv.org/abs/2311.15218v1","updated":"2023-11-26T07:19:10Z","published":"2023-11-26T07:19:10Z","title":"Dataset for Stock Market Forecasting Based on Quantitative Analysis and\n Qualitative Data","summary":" The application of machine learning to finance has become a familiar\napproach, even more so in stock market forecasting. The stock market is highly\nvolatile and huge amounts of data are generated every minute globally. The\nextraction of effective intelligence from this data is of critical importance.\nHowever, combining numerical stock data with qualitative text data can\nbe a challenging task. In this work, we accomplish this and provide an\nunprecedented, publicly available dataset with technical and fundamental data, and\nsentiment that we gathered from News Archives, TV news captions, Radio\nTranscripts, Tweets, Daily financial newspapers, etc. The text data entries\nused for sentiment extraction total more than 1.4 Million. 
The dataset\ncomprises daily entries from January 2018 to December 2022 for 8 different\ncompanies and the Dow Jones Index as a whole. Holistic fundamental and technical\ndata is provided, training-ready for model learning and deployment. The\npredictive power of deep learning models is highly determined by the training\ndata provided. This dataset would benefit research globally\nthat incorporates qualitative intelligence for stock market forecasting. The\ndataset is made available at https://github.com/batking24/Huge-Stock-Dataset.\n","authors":["Sai Akash Bathini","Dagli Cihan"],"pdf_url":"https://arxiv.org/pdf/2311.15218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15211v1","updated":"2023-11-26T06:56:02Z","published":"2023-11-26T06:56:02Z","title":"Probabilistic Transformer: A Probabilistic Dependency Model for\n Contextual Word Representation","summary":" Syntactic structures used to play a vital role in natural language processing\n(NLP), but since the deep learning revolution, NLP has been gradually dominated\nby neural models that do not consider syntactic structures in their design. One\nvastly successful class of neural models is transformers. When used as an\nencoder, a transformer produces contextual representations of words in the input\nsentence. In this work, we propose a new model of contextual word\nrepresentation, not from a neural perspective, but from a purely syntactic and\nprobabilistic perspective. Specifically, we design a conditional random field\nthat models discrete latent representations of all words in a sentence as well\nas dependency arcs between them; and we use mean field variational inference\nfor approximate inference. Strikingly, we find that the computation graph of\nour model resembles transformers, with correspondences between dependencies and\nself-attention and between distributions over latent representations and\ncontextual embeddings of words. Experiments show that our model performs\ncompetitively with transformers on small to medium sized datasets. We hope that\nour work could help bridge the gap between traditional syntactic and\nprobabilistic approaches and cutting-edge neural approaches to NLP, and inspire\nmore linguistically-principled neural approaches in the future.\n","authors":["Haoyi Wu","Kewei Tu"],"pdf_url":"https://arxiv.org/pdf/2311.15211v1.pdf","comment":"Accepted to ACL2023 Findings"},{"id":"http://arxiv.org/abs/2311.15208v1","updated":"2023-11-26T06:24:25Z","published":"2023-11-26T06:24:25Z","title":"LongStory: Coherent, Complete and Length Controlled Long story\n Generation","summary":" A human author can write any length of story without losing coherence. Also,\nthey always bring the story to a proper ending, an ability that current\nlanguage models lack. In this work, we present LongStory for coherent,\ncomplete, and length-controlled long story generation. LongStory introduces two\nnovel methodologies: (1) the long and short-term contexts weight calibrator\n(CWC) and (2) long story structural positions (LSP). The CWC adjusts weights\nfor long-term context Memory and short-term context Cheating, acknowledging\ntheir distinct roles. The LSP employs discourse tokens to convey the structural\npositions of a long story. Trained on three datasets with varied average story\nlengths, LongStory outperforms other baselines, including the strong story\ngenerator Plotmachine, in coherence, completeness, relevance, and\nrepetitiveness. 
We also perform zero-shot tests on each dataset to assess the\nmodel's ability to predict outcomes beyond its training data and validate our\nmethodology by comparing its performance with variants of our model.\n","authors":["Kyeongman Park","Nakyeong Yang","Kyomin Jung"],"pdf_url":"https://arxiv.org/pdf/2311.15208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15198v1","updated":"2023-11-26T05:34:22Z","published":"2023-11-26T05:34:22Z","title":"ChatGPT and Beyond: The Generative AI Revolution in Education","summary":" The wide adoption and usage of generative artificial intelligence (AI)\nmodels, particularly ChatGPT, has sparked a surge in research exploring their\npotential applications in the educational landscape. This survey examines\nacademic literature published between November, 2022, and July, 2023,\nspecifically targeting high-impact research from Scopus-indexed Q1 and Q2\njournals. This survey delves into the practical applications and implications\nof generative AI models across a diverse range of educational contexts. Through\na comprehensive and rigorous evaluation of recent academic literature, this\nsurvey seeks to illuminate the evolving role of generative AI models,\nparticularly ChatGPT, in education. By shedding light on the potential\nbenefits, challenges, and emerging trends in this dynamic field, the survey\nendeavors to contribute to the understanding of the nexus between artificial\nintelligence and education. The findings of this review will empower educators,\nresearchers, and policymakers to make informed decisions about the integration\nof AI technologies into learning environments.\n","authors":["Mohammad AL-Smadi"],"pdf_url":"https://arxiv.org/pdf/2311.15198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.10013v2","updated":"2023-11-26T05:05:51Z","published":"2022-12-20T06:01:13Z","title":"DocAsRef: An Empirical Study on Repurposing Reference-Based Summary\n Quality Metrics Reference-Freely","summary":" Automated summary quality assessment falls into two categories:\nreference-based and reference-free. Reference-based metrics, historically\ndeemed more accurate due to the additional information provided by\nhuman-written references, are limited by their reliance on human input. In this\npaper, we hypothesize that the comparison methodologies used by some\nreference-based metrics to evaluate a system summary against its corresponding\nreference can be effectively adapted to assess it against its source document,\nthereby transforming these metrics into reference-free ones. Experimental\nresults support this hypothesis. After being repurposed reference-freely, the\nzero-shot BERTScore using the pretrained DeBERTa-large-MNLI model of <0.5B\nparameters consistently outperforms its original reference-based version across\nvarious aspects on the SummEval and Newsroom datasets. It also excels in\ncomparison to most existing reference-free metrics and closely competes with\nzero-shot summary evaluators based on GPT-3.5.\n","authors":["Forrest Sheng Bao","Ruixuan Tu","Ge Luo","Yinfei Yang","Hebi Li","Minghui Qiu","Youbiao He","Cen Chen"],"pdf_url":"https://arxiv.org/pdf/2212.10013v2.pdf","comment":"Accepted into Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.15180v1","updated":"2023-11-26T03:54:03Z","published":"2023-11-26T03:54:03Z","title":"Benchmarking Large Language Model Volatility","summary":" The impact of non-deterministic outputs from Large Language Models (LLMs) is\nnot well examined for financial text understanding tasks. 
Through a compelling\ncase study on investing in the US equity market via news sentiment analysis, we\nuncover substantial variability in sentence-level sentiment classification\nresults, underscoring the innate volatility of LLM outputs. These uncertainties\ncascade downstream, leading to more significant variations in portfolio\nconstruction and return. While tweaking the temperature parameter in the\nlanguage model decoder presents a potential remedy, it comes at the expense of\nstifled creativity. Similarly, while ensembling multiple outputs mitigates the\neffect of volatile outputs, it demands a notable computational investment. This\nwork furnishes practitioners with invaluable insights for adeptly navigating\nuncertainty in the integration of LLMs into financial decision-making,\nparticularly in scenarios dictated by non-deterministic information.\n","authors":["Boyang Yu"],"pdf_url":"https://arxiv.org/pdf/2311.15180v1.pdf","comment":"7 pages, 2 figures, Workshop on AI Safety and Robustness In Finance,\n ICAIF 2023"},{"id":"http://arxiv.org/abs/2311.06401v3","updated":"2023-11-26T02:42:13Z","published":"2023-11-10T21:32:34Z","title":"Autoregressive Language Models For Estimating the Entropy of Epic EHR\n Audit Logs","summary":" EHR audit logs are a highly granular stream of events that capture clinician\nactivities, and is a significant area of interest for research in\ncharacterizing clinician workflow on the electronic health record (EHR).\nExisting techniques to measure the complexity of workflow through EHR audit\nlogs (audit logs) involve time- or frequency-based cross-sectional aggregations\nthat are unable to capture the full complexity of a EHR session. We briefly\nevaluate the usage of transformer-based tabular language model (tabular LM) in\nmeasuring the entropy or disorderedness of action sequences within workflow and\nrelease the evaluated models publicly.\n","authors":["Benjamin C. Warner","Thomas Kannampallil","Seunghwan Kim"],"pdf_url":"https://arxiv.org/pdf/2311.06401v3.pdf","comment":"Extended Abstract presented at Machine Learning for Health (ML4H)\n symposium 2023, December 10th, 2023, New Orleans, United States, 10 pages"},{"id":"http://arxiv.org/abs/2206.10498v4","updated":"2023-11-26T01:15:41Z","published":"2022-06-21T16:15:27Z","title":"PlanBench: An Extensible Benchmark for Evaluating Large Language Models\n on Planning and Reasoning about Change","summary":" Generating plans of action, and reasoning about change have long been\nconsidered a core competence of intelligent agents. It is thus no surprise that\nevaluating the planning and reasoning capabilities of large language models\n(LLMs) has become a hot topic of research. Most claims about LLM planning\ncapabilities are however based on common sense tasks-where it becomes hard to\ntell whether LLMs are planning or merely retrieving from their vast world\nknowledge. There is a strong need for systematic and extensible planning\nbenchmarks with sufficient diversity to evaluate whether LLMs have innate\nplanning capabilities. Motivated by this, we propose PlanBench, an extensible\nbenchmark suite based on the kinds of domains used in the automated planning\ncommunity, especially in the International Planning Competition, to test the\ncapabilities of LLMs in planning or reasoning about actions and change.\nPlanBench provides sufficient diversity in both the task domains and the\nspecific planning capabilities. 
Our studies also show that on many critical\ncapabilities-including plan generation-LLM performance falls quite short, even\nwith the SOTA models. PlanBench can thus function as a useful marker of\nprogress of LLMs in planning and reasoning.\n","authors":["Karthik Valmeekam","Matthew Marquez","Alberto Olmo","Sarath Sreedharan","Subbarao Kambhampati"],"pdf_url":"https://arxiv.org/pdf/2206.10498v4.pdf","comment":"NeurIPS 2023 Track on Datasets and Benchmarks"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2304.08138v2","updated":"2023-11-26T23:52:43Z","published":"2023-04-17T10:42:30Z","title":"Typos-aware Bottlenecked Pre-Training for Robust Dense Retrieval","summary":" Current dense retrievers (DRs) are limited in their ability to effectively\nprocess misspelled queries, which constitute a significant portion of query\ntraffic in commercial search engines. The main issue is that the pre-trained\nlanguage model-based encoders used by DRs are typically trained and fine-tuned\nusing clean, well-curated text data. Misspelled queries are typically not found\nin the data used for training these models, and thus misspelled queries\nobserved at inference time are out-of-distribution compared to the data used\nfor training and fine-tuning. Previous efforts to address this issue have\nfocused on \\textit{fine-tuning} strategies, but their effectiveness on\nmisspelled queries remains lower than that of pipelines that employ separate\nstate-of-the-art spell-checking components. To address this challenge, we\npropose ToRoDer (TypOs-aware bottlenecked pre-training for RObust DEnse\nRetrieval), a novel re-training strategy for DRs that increases their\nrobustness to misspelled queries while preserving their effectiveness in\ndownstream retrieval tasks. ToRoDer utilizes an encoder-decoder architecture\nwhere the encoder takes misspelled text with masked tokens as input and outputs\nbottlenecked information to the decoder. The decoder then takes as input the\nbottlenecked embeddings, along with token embeddings of the original text with\nthe misspelled tokens masked out. The pre-training task is to recover the\nmasked tokens for both the encoder and decoder. Our extensive experimental\nresults and detailed ablation studies show that DRs pre-trained with ToRoDer\nexhibit significantly higher effectiveness on misspelled queries, sensibly\nclosing the gap with pipelines that use a separate, complex spell-checker\ncomponent, while retaining their effectiveness on correctly spelled queries.\n","authors":["Shengyao Zhuang","Linjun Shou","Jian Pei","Ming Gong","Houxing Ren","Guido Zuccon","Daxin Jiang"],"pdf_url":"https://arxiv.org/pdf/2304.08138v2.pdf","comment":"10 pages, accepted at SIGIR-AP"},{"id":"http://arxiv.org/abs/2311.15426v1","updated":"2023-11-26T21:16:12Z","published":"2023-11-26T21:16:12Z","title":"Data Augmentation for Sample Efficient and Robust Document Ranking","summary":" Contextual ranking models have delivered impressive performance improvements\nover classical models in the document ranking task. However, these highly\nover-parameterized models tend to be data-hungry and require large amounts of\ndata even for fine-tuning. In this paper, we propose data-augmentation methods\nfor effective and robust ranking performance. One of the key benefits of using\ndata augmentation is in achieving sample efficiency or learning effectively\nwhen we have only a small amount of training data. 
We propose supervised and\nunsupervised data augmentation schemes by creating training data using parts of\nthe relevant documents in the query-document pairs. We then adapt a family of\ncontrastive losses for the document ranking task that can exploit the augmented\ndata to learn an effective ranking model. Our extensive experiments on subsets\nof the MS MARCO and TREC-DL test sets show that data augmentation, along with\nthe ranking-adapted contrastive losses, results in performance improvements\nunder most dataset sizes. Apart from sample efficiency, we conclusively show\nthat data augmentation results in robust models when transferred to\nout-of-domain benchmarks. Our performance improvements in in-domain and more\nprominently in out-of-domain benchmarks show that augmentation regularizes the\nranking model and improves its robustness and generalization capability.\n","authors":["Abhijit Anand","Jurek Leonhardt","Jaspreet Singh","Koustav Rudra","Avishek Anand"],"pdf_url":"https://arxiv.org/pdf/2311.15426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19453v2","updated":"2023-11-26T12:40:27Z","published":"2023-10-30T11:25:03Z","title":"FLIP: Towards Fine-grained Alignment between ID-based Models and\n Pretrained Language Models for CTR Prediction","summary":" Click-through rate (CTR) prediction plays as a core function module in\nvarious personalized online services. The traditional ID-based models for CTR\nprediction take as inputs the one-hot encoded ID features of tabular modality,\nwhich capture the collaborative signals via feature interaction modeling. But\nthe one-hot encoding discards the semantic information conceived in the\noriginal feature texts. Recently, the emergence of Pretrained Language Models\n(PLMs) has given rise to another paradigm, which takes as inputs the sentences\nof textual modality obtained by hard prompt templates and adopts PLMs to\nextract the semantic knowledge. However, PLMs generally tokenize the input text\ndata into subword tokens and ignore field-wise collaborative signals.\nTherefore, these two lines of research focus on different characteristics of\nthe same input data (i.e., textual and tabular modalities), forming a distinct\ncomplementary relationship with each other. In this paper, we propose to\nconduct Fine-grained feature-level ALignment between ID-based Models and\nPretrained Language Models (FLIP) for CTR prediction. We design a novel joint\nreconstruction pretraining task for both masked language and tabular modeling.\nSpecifically, the masked data of one modality (i.e., tokens or features) has to\nbe recovered with the help of the other modality, which establishes the\nfeature-level interaction and alignment via sufficient mutual information\nextraction between dual modalities. Moreover, we propose to jointly finetune\nthe ID-based model and PLM for downstream CTR prediction tasks, thus achieving\nsuperior performance by combining the advantages of both models. 
Extensive\nexperiments on three real-world datasets demonstrate that FLIP outperforms SOTA\nbaselines, and is highly compatible with various ID-based models and PLMs.\n","authors":["Hangyu Wang","Jianghao Lin","Xiangyang Li","Bo Chen","Chenxu Zhu","Ruiming Tang","Weinan Zhang","Yong Yu"],"pdf_url":"https://arxiv.org/pdf/2310.19453v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2308.10778v2","updated":"2023-11-26T11:32:59Z","published":"2023-08-21T15:09:19Z","title":"A Topology-aware Analysis of Graph Collaborative Filtering","summary":" The successful integration of graph neural networks into recommender systems\n(RSs) has led to a novel paradigm in collaborative filtering (CF), graph\ncollaborative filtering (graph CF). By representing user-item data as an\nundirected, bipartite graph, graph CF utilizes short- and long-range\nconnections to extract collaborative signals that yield more accurate user\npreferences than traditional CF methods. Although the recent literature\nhighlights the efficacy of various algorithmic strategies in graph CF, the\nimpact of datasets and their topological features on recommendation performance\nis yet to be studied. To fill this gap, we propose a topology-aware analysis of\ngraph CF. In this study, we (i) take some widely-adopted recommendation\ndatasets and use them to generate a large set of synthetic sub-datasets through\ntwo state-of-the-art graph sampling methods, (ii) measure eleven of their\nclassical and topological characteristics, and (iii) estimate the accuracy\ncalculated on the generated sub-datasets considering four popular and recent\ngraph-based RSs (i.e., LightGCN, DGCF, UltraGCN, and SVD-GCN). Finally, the\ninvestigation presents an explanatory framework that reveals the linear\nrelationships between characteristics and accuracy measures. The results,\nstatistically validated under different graph sampling settings, confirm the\nexistence of solid dependencies between topological characteristics and\naccuracy in the graph-based recommendation, offering a new perspective on how\nto interpret graph CF.\n","authors":["Daniele Malitesta","Claudio Pomo","Vito Walter Anelli","Alberto Carlo Maria Mancino","Eugenio Di Sciascio","Tommaso Di Noia"],"pdf_url":"https://arxiv.org/pdf/2308.10778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14742v1","updated":"2023-11-26T07:34:18Z","published":"2023-11-26T07:34:18Z","title":"Query-LIFE: Query-aware Language Image Fusion Embedding for E-Commerce\n Relevance","summary":" Relevance modules play a fundamental role in e-commerce search as they are\nresponsible for selecting relevant products from thousands of items based on\nuser queries, thereby enhancing user experience and efficiency. The\ntraditional approach models relevance based on product titles and queries, but\nthe information in titles alone may be insufficient to describe the products\ncompletely. A more general optimization approach is to further leverage product\nimage information. In recent years, vision-language pre-training models have\nachieved impressive results in many scenarios, which leverage contrastive\nlearning to map both textual and visual features into a joint embedding space.\nIn e-commerce, a common practice is to fine-tune the pre-trained model on\ne-commerce data. However, the performance is sub-optimal because the\nvision-language pre-training models lack alignment specifically designed for\nqueries. 
In this paper, we propose a method called Query-LIFE (Query-aware\nLanguage Image Fusion Embedding) to address these challenges. Query-LIFE\nutilizes a query-based multimodal fusion to effectively incorporate the image\nand title based on the product types. Additionally, it employs query-aware\nmodal alignment to enhance the accuracy of the comprehensive representation of\nproducts. Furthermore, we design GenFilt, which utilizes the generation\ncapability of large models to filter out false negative samples and further\nimprove the overall performance of the contrastive learning task in the model.\nExperiments have demonstrated that Query-LIFE outperforms existing baselines.\nWe have conducted ablation studies and human evaluations to validate the\neffectiveness of each module within Query-LIFE. Moreover, Query-LIFE has been\ndeployed on Miravia Search, resulting in improved both relevance and conversion\nefficiency.\n","authors":["Hai Zhu","Yuankai Guo","Ronggang Dou","Kai Liu"],"pdf_url":"https://arxiv.org/pdf/2311.14742v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12301v2","updated":"2023-11-26T15:06:49Z","published":"2023-07-23T11:50:27Z","title":"Unsupervised Image Outlier Detection using RANSAC","summary":" Image outlier detection (OD) is an essential tool to ensure the quality and\naccuracy of image datasets used in computer vision tasks. Most existing\napproaches, however, require a set of in-distribution data for training prior\nto outlier prediction. The quality and quantity of the data can influence the\nresulting performance. Thus, selecting a suitable in-distribution set often\nrequires considerable effort. In this work, we propose RANSAC-NN, an\nunsupervised image OD algorithm designed to detect outliers within contaminated\nsets in a one-class classification fashion. Without any training, RANSAC-NN\nperforms favorably in comparison to other well-established methods in a variety\nof OD benchmarks. Furthermore, we show that our method can enhance the\nrobustness of existing OD methods by simply applying RANSAC-NN during\npre-processing.\n","authors":["Chen-Han Tsai","Yu-Shao Peng"],"pdf_url":"https://arxiv.org/pdf/2307.12301v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2302.05543v3","updated":"2023-11-26T22:26:12Z","published":"2023-02-10T23:12:37Z","title":"Adding Conditional Control to Text-to-Image Diffusion Models","summary":" We present ControlNet, a neural network architecture to add spatial\nconditioning controls to large, pretrained text-to-image diffusion models.\nControlNet locks the production-ready large diffusion models, and reuses their\ndeep and robust encoding layers pretrained with billions of images as a strong\nbackbone to learn a diverse set of conditional controls. The neural\narchitecture is connected with \"zero convolutions\" (zero-initialized\nconvolution layers) that progressively grow the parameters from zero and ensure\nthat no harmful noise could affect the finetuning. We test various conditioning\ncontrols, eg, edges, depth, segmentation, human pose, etc, with Stable\nDiffusion, using single or multiple conditions, with or without prompts. We\nshow that the training of ControlNets is robust with small (<50k) and large\n(>1m) datasets. 
Extensive results show that ControlNet may facilitate wider\napplications to control image diffusion models.\n","authors":["Lvmin Zhang","Anyi Rao","Maneesh Agrawala"],"pdf_url":"https://arxiv.org/pdf/2302.05543v3.pdf","comment":"Codes and Supplementary Material:\n https://github.com/lllyasviel/ControlNet"},{"id":"http://arxiv.org/abs/2307.14335v2","updated":"2023-11-26T14:12:37Z","published":"2023-07-26T17:54:04Z","title":"WavJourney: Compositional Audio Creation with Large Language Models","summary":" Despite breakthroughs in audio generation models, their capabilities are\noften confined to domain-specific conditions such as speech transcriptions and\naudio captions. However, real-world audio creation aims to generate harmonious\naudio containing various elements such as speech, music, and sound effects with\ncontrollable conditions, which is challenging to address using existing audio\ngeneration systems. We present WavJourney, a novel framework that leverages\nLarge Language Models (LLMs) to connect various audio models for audio\ncreation. WavJourney allows users to create storytelling audio content with\ndiverse audio elements simply from textual descriptions. Specifically, given a\ntext instruction, WavJourney first prompts LLMs to generate an audio script\nthat serves as a structured semantic representation of audio elements. The\naudio script is then converted into a computer program, where each line of the\nprogram calls a task-specific audio generation model or computational operation\nfunction. The computer program is then executed to obtain a compositional and\ninterpretable solution for audio creation. Experimental results suggest that\nWavJourney is capable of synthesizing realistic audio aligned with\ntextually-described semantic, spatial and temporal conditions, achieving\nstate-of-the-art results on text-to-audio generation benchmarks. Additionally,\nwe introduce a new multi-genre story benchmark. Subjective evaluations\ndemonstrate the potential of WavJourney in crafting engaging storytelling audio\ncontent from text. We further demonstrate that WavJourney can facilitate\nhuman-machine co-creation in multi-round dialogues. To foster future research,\nthe code and synthesized audio are available at:\nhttps://audio-agi.github.io/WavJourney_demopage/.\n","authors":["Xubo Liu","Zhongkai Zhu","Haohe Liu","Yi Yuan","Meng Cui","Qiushi Huang","Jinhua Liang","Yin Cao","Qiuqiang Kong","Mark D. Plumbley","Wenwu Wang"],"pdf_url":"https://arxiv.org/pdf/2307.14335v2.pdf","comment":"GitHub: https://github.com/Audio-AGI/WavJourney"},{"id":"http://arxiv.org/abs/2311.15230v1","updated":"2023-11-26T08:04:43Z","published":"2023-11-26T08:04:43Z","title":"GAIA: Zero-shot Talking Avatar Generation","summary":" Zero-shot talking avatar generation aims at synthesizing natural talking\nvideos from speech and a single portrait image. Previous methods have relied on\ndomain-specific heuristics such as warping-based motion representation and 3D\nMorphable Models, which limit the naturalness and diversity of the generated\navatars. In this work, we introduce GAIA (Generative AI for Avatar), which\neliminates the domain priors in talking avatar generation. 
In light of the\nobservation that the speech only drives the motion of the avatar while the\nappearance of the avatar and the background typically remain the same\nthroughout the entire video, we divide our approach into two stages: 1)\ndisentangling each frame into motion and appearance representations; 2)\ngenerating motion sequences conditioned on the speech and reference portrait\nimage. We collect a large-scale high-quality talking avatar dataset and train\nthe model on it with different scales (up to 2B parameters). Experimental\nresults verify the superiority, scalability, and flexibility of GAIA as 1) the\nresulting model beats previous baseline models in terms of naturalness,\ndiversity, lip-sync quality, and visual quality; 2) the framework is scalable\nsince larger models yield better results; 3) it is general and enables\ndifferent applications like controllable talking avatar generation and\ntext-instructed avatar generation.\n","authors":["Tianyu He","Junliang Guo","Runyi Yu","Yuchi Wang","Jialiang Zhu","Kaikai An","Leyi Li","Xu Tan","Chunyu Wang","Han Hu","HsiangTao Wu","Sheng Zhao","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2311.15230v1.pdf","comment":"Project page: https://microsoft.github.io/GAIA/"}]},"2023-11-25T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2311.15131v1","updated":"2023-11-25T22:41:23Z","published":"2023-11-25T22:41:23Z","title":"Localizing Lying in Llama: Understanding Instructed Dishonesty on\n True-False Questions Through Prompting, Probing, and Patching","summary":" Large language models (LLMs) demonstrate significant knowledge through their\noutputs, though it is often unclear whether false outputs are due to a lack of\nknowledge or dishonesty. In this paper, we investigate instructed dishonesty,\nwherein we explicitly prompt LLaMA-2-70b-chat to lie. We perform prompt\nengineering to find which prompts best induce lying behavior, and then use\nmechanistic interpretability approaches to localize where in the network this\nbehavior occurs. Using linear probing and activation patching, we localize five\nlayers that appear especially important for lying. We then find just 46\nattention heads within these layers that enable us to causally intervene such\nthat the lying model instead answers honestly. We show that these interventions\nwork robustly across many prompts and dataset splits. Overall, our work\ncontributes a greater understanding of dishonesty in LLMs so that we may hope\nto prevent it.\n","authors":["James Campbell","Richard Ren","Phillip Guo"],"pdf_url":"https://arxiv.org/pdf/2311.15131v1.pdf","comment":"14 pages, 12 figures"},{"id":"http://arxiv.org/abs/2303.15445v3","updated":"2023-11-25T22:07:55Z","published":"2023-03-27T17:59:55Z","title":"IRFL: Image Recognition of Figurative Language","summary":" Figures of speech such as metaphors, similes, and idioms are integral parts\nof human communication. They are ubiquitous in many forms of discourse,\nallowing people to convey complex, abstract ideas and evoke emotion. As\nfigurative forms are often conveyed through multiple modalities (e.g., both\ntext and images), understanding multimodal figurative language is an important\nAI challenge, weaving together profound vision, language, commonsense and\ncultural knowledge. In this work, we develop the Image Recognition of\nFigurative Language (IRFL) dataset. 
We leverage human annotation and an\nautomatic pipeline we created to generate a multimodal dataset, and introduce\ntwo novel tasks as a benchmark for multimodal figurative language\nunderstanding. We experimented with state-of-the-art vision and language models\nand found that the best (22%) performed substantially worse than humans (97%).\nWe release our dataset, benchmark, and code, in hopes of driving the\ndevelopment of models that can better understand figurative language.\n","authors":["Ron Yosef","Yonatan Bitton","Dafna Shahaf"],"pdf_url":"https://arxiv.org/pdf/2303.15445v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.04568v2","updated":"2023-11-25T19:52:13Z","published":"2022-08-09T07:15:20Z","title":"The Impact of Data Corruption on Named Entity Recognition for\n Low-resourced Languages","summary":" Data availability and quality are major challenges in natural language\nprocessing for low-resourced languages. In particular, there is significantly\nless data available than for higher-resourced languages. This data is also\noften of low quality, rife with errors, invalid text or incorrect annotations.\nMany prior works focus on dealing with these problems, either by generating\nsynthetic data, or filtering out low-quality parts of datasets. We instead\ninvestigate these factors more deeply, by systematically measuring the effect\nof data quantity and quality on the performance of pre-trained language models\nin a low-resourced setting. Our results show that having fewer\ncompletely-labelled sentences is significantly better than having more\nsentences with missing labels; and that models can perform remarkably well with\nonly 10% of the training data. Importantly, these results are consistent across\nten low-resource languages, English, and four pre-trained models.\n","authors":["Manuel Fokam","Michael Beukman"],"pdf_url":"https://arxiv.org/pdf/2208.04568v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15110v1","updated":"2023-11-25T19:50:41Z","published":"2023-11-25T19:50:41Z","title":"Relevance feedback strategies for recall-oriented neural information\n retrieval","summary":" In a number of information retrieval applications (e.g., patent search,\nliterature review, due diligence, etc.), preventing false negatives is more\nimportant than preventing false positives. However, approaches designed to\nreduce review effort (like \"technology assisted review\") can create false\nnegatives, since they are often based on active learning systems that exclude\ndocuments automatically based on user feedback. Therefore, this research\nproposes a more recall-oriented approach to reducing review effort. More\nspecifically, through iteratively re-ranking the relevance rankings based on\nuser feedback, which is also referred to as relevance feedback. In our proposed\nmethod, the relevance rankings are produced by a BERT-based dense-vector search\nand the relevance feedback is based on cumulatively summing the queried and\nselected embeddings. 
Our results show that this method can reduce review effort\nbetween 17.85% and 59.04%, compared to a baseline approach (of no feedback),\ngiven a fixed recall target\n","authors":["Timo Kats","Peter van der Putten","Jan Scholtes"],"pdf_url":"https://arxiv.org/pdf/2311.15110v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15106v1","updated":"2023-11-25T19:35:53Z","published":"2023-11-25T19:35:53Z","title":"Solving the Right Problem is Key for Translational NLP: A Case Study in\n UMLS Vocabulary Insertion","summary":" As the immense opportunities enabled by large language models become more\napparent, NLP systems will be increasingly expected to excel in real-world\nsettings. However, in many instances, powerful models alone will not yield\ntranslational NLP solutions, especially if the formulated problem is not well\naligned with the real-world task. In this work, we study the case of UMLS\nvocabulary insertion, an important real-world task in which hundreds of\nthousands of new terms, referred to as atoms, are added to the UMLS, one of the\nmost comprehensive open-source biomedical knowledge bases. Previous work aimed\nto develop an automated NLP system to make this time-consuming, costly, and\nerror-prone task more efficient. Nevertheless, practical progress in this\ndirection has been difficult to achieve due to a problem formulation and\nevaluation gap between research output and the real-world task. In order to\naddress this gap, we introduce a new formulation for UMLS vocabulary insertion\nwhich mirrors the real-world task, datasets which faithfully represent it and\nseveral strong baselines we developed through re-purposing existing solutions.\nAdditionally, we propose an effective rule-enhanced biomedical language model\nwhich enables important new model behavior, outperforms all strong baselines\nand provides measurable qualitative improvements to editors who carry out the\nUVI task. We hope this case study provides insight into the considerable\nimportance of problem formulation for the success of translational NLP\nsolutions.\n","authors":["Bernal Jimenez Gutierrez","Yuqing Mao","Vinh Nguyen","Kin Wah Fung","Yu Su","Olivier Bodenreider"],"pdf_url":"https://arxiv.org/pdf/2311.15106v1.pdf","comment":"EMNLP 2023 Findings; Code is available at\n https://github.com/OSU-NLP-Group/UMLS-Vocabulary-Insertion"},{"id":"http://arxiv.org/abs/2310.19736v3","updated":"2023-11-25T17:35:12Z","published":"2023-10-30T17:00:52Z","title":"Evaluating Large Language Models: A Comprehensive Survey","summary":" Large language models (LLMs) have demonstrated remarkable capabilities across\na broad spectrum of tasks. They have attracted significant attention and been\ndeployed in numerous downstream applications. Nevertheless, akin to a\ndouble-edged sword, LLMs also present potential risks. They could suffer from\nprivate data leaks or yield inappropriate, harmful, or misleading content.\nAdditionally, the rapid progress of LLMs raises concerns about the potential\nemergence of superintelligent systems without adequate safeguards. To\neffectively capitalize on LLM capacities as well as ensure their safe and\nbeneficial development, it is critical to conduct a rigorous and comprehensive\nevaluation of LLMs.\n This survey endeavors to offer a panoramic perspective on the evaluation of\nLLMs. We categorize the evaluation of LLMs into three major groups: knowledge\nand capability evaluation, alignment evaluation and safety evaluation. 
In\naddition to the comprehensive review on the evaluation methodologies and\nbenchmarks on these three aspects, we collate a compendium of evaluations\npertaining to LLMs' performance in specialized domains, and discuss the\nconstruction of comprehensive evaluation platforms that cover LLM evaluations\non capabilities, alignment, safety, and applicability.\n We hope that this comprehensive overview will stimulate further research\ninterests in the evaluation of LLMs, with the ultimate goal of making\nevaluation serve as a cornerstone in guiding the responsible development of\nLLMs. We envision that this will channel their evolution into a direction that\nmaximizes societal benefit while minimizing potential risks. A curated list of\nrelated papers has been publicly available at\nhttps://github.com/tjunlp-lab/Awesome-LLMs-Evaluation-Papers.\n","authors":["Zishan Guo","Renren Jin","Chuang Liu","Yufei Huang","Dan Shi"," Supryadi","Linhao Yu","Yan Liu","Jiaxuan Li","Bojian Xiong","Deyi Xiong"],"pdf_url":"https://arxiv.org/pdf/2310.19736v3.pdf","comment":"111 pages"},{"id":"http://arxiv.org/abs/2305.07637v3","updated":"2023-11-25T17:19:36Z","published":"2023-05-12T17:46:06Z","title":"Text2Cohort: Facilitating Intuitive Access to Biomedical Data with\n Natural Language Cohort Discovery","summary":" The Imaging Data Commons (IDC) is a cloud-based database that provides\nresearchers with open access to cancer imaging data, with the goal of\nfacilitating collaboration. However, cohort discovery within the IDC database\nhas a significant technical learning curve. Recently, large language models\n(LLM) have demonstrated exceptional utility for natural language processing\ntasks. We developed Text2Cohort, a LLM-powered toolkit to facilitate\nuser-friendly natural language cohort discovery in the IDC. Our method\ntranslates user input into IDC queries using grounding techniques and returns\nthe query's response. We evaluate Text2Cohort on 50 natural language inputs,\nfrom information extraction to cohort discovery. Our toolkit successfully\ngenerated responses with an 88% accuracy and 0.94 F1 score. We demonstrate that\nText2Cohort can enable researchers to discover and curate cohorts on IDC with\nhigh levels of accuracy using natural language in a more intuitive and\nuser-friendly way.\n","authors":["Pranav Kulkarni","Adway Kanhere","Paul H. Yi","Vishwa S. Parekh"],"pdf_url":"https://arxiv.org/pdf/2305.07637v3.pdf","comment":"5 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2311.15077v1","updated":"2023-11-25T17:05:21Z","published":"2023-11-25T17:05:21Z","title":"Multilingual self-supervised speech representations improve the speech\n recognition of low-resource African languages with codeswitching","summary":" While many speakers of low-resource languages regularly code-switch between\ntheir languages and other regional languages or English, datasets of\ncodeswitched speech are too small to train bespoke acoustic models from scratch\nor do language model rescoring. Here we propose finetuning self-supervised\nspeech representations such as wav2vec 2.0 XLSR to recognize code-switched\ndata. We find that finetuning self-supervised multilingual representations and\naugmenting them with n-gram language models trained from transcripts reduces\nabsolute word error rates by up to 20% compared to baselines of hybrid models\ntrained from scratch on code-switched data. 
Our findings suggest that in\ncircumstances with limited training data finetuning self-supervised\nrepresentations is a better performing and viable solution.\n","authors":["Tolúlopé Ògúnrèmí","Christopher D. Manning","Dan Jurafsky"],"pdf_url":"https://arxiv.org/pdf/2311.15077v1.pdf","comment":"5 pages, 1 figure. Computational Approaches to Linguistic\n Code-Switching, CALCS 2023 (co-located with EMNLP 2023)"},{"id":"http://arxiv.org/abs/2311.15055v1","updated":"2023-11-25T15:27:10Z","published":"2023-11-25T15:27:10Z","title":"Automatically Finding and Categorizing Replication Studies","summary":" In many fields of experimental science, papers that failed to replicate\ncontinue to be cited as a result of the poor discoverability of replication\nstudies. As a first step to creating a system that automatically finds\nreplication studies for a given paper, 334 replication studies and 344\nreplicated studies were collected. Replication studies could be identified in\nthe dataset based on text content at a higher rate than chance (AUROC = 0.886).\n Additionally, successful replication studies could be distinguished from\nfailed replication studies at a higher rate than chance (AUROC = 0.664).\n","authors":["Bob de Ruiter"],"pdf_url":"https://arxiv.org/pdf/2311.15055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15054v1","updated":"2023-11-25T15:23:46Z","published":"2023-11-25T15:23:46Z","title":"Detection of developmental language disorder in Cypriot Greek children\n using a machine learning neural network algorithm","summary":" Children with developmental language disorder (DLD) encounter difficulties in\nacquiring various language structures. Early identification and intervention\nare crucial to prevent negative long-term outcomes impacting the academic,\nsocial, and emotional development of children. The study aims to develop an\nautomated method for the identification of DLD using artificial intelligence,\nspecifically a neural network machine learning algorithm. This protocol is\napplied for the first time in Cypriot Greek children, which is generally\nconsidered underresearched in the context of DLD. The neural network model was\ntrained using perceptual and production data elicited from children with DLD\nand healthy controls. The k-fold technique was used to crossvalidate the\nalgorithm. The performance of the model was evaluated using metrics such as\naccuracy, precision, recall, F1 score, and ROC/AUC curve to assess its ability\nto make accurate predictions on a set of unseen data. The results demonstrated\nhigh classification values for all metrics (between 0.92 and 0.98), indicating\nthe high accuracy of the neural model in classifying children with DLD.\nAdditionally, the variable importance analysis revealed that the language\nproduction skills of children had a more significant impact on the performance\nof the model compared to perception skills. Neural networks represent powerful\ntools for detecting DLD, providing early and quick assessments of the disorder,\nand having the potential to improve clinical outcomes.\n","authors":["Georgios P. 
Georgiou","Elena Theodorou"],"pdf_url":"https://arxiv.org/pdf/2311.15054v1.pdf","comment":"13 pages, 3 figures, journal article"},{"id":"http://arxiv.org/abs/2311.15032v1","updated":"2023-11-25T13:58:58Z","published":"2023-11-25T13:58:58Z","title":"nlpBDpatriots at BLP-2023 Task 2: A Transfer Learning Approach to Bangla\n Sentiment Analysis","summary":" In this paper, we discuss the nlpBDpatriots entry to the shared task on\nSentiment Analysis of Bangla Social Media Posts organized at the first workshop\non Bangla Language Processing (BLP) co-located with EMNLP. The main objective\nof this task is to identify the polarity of social media content using a Bangla\ndataset annotated with positive, neutral, and negative labels provided by the\nshared task organizers. Our best system for this task is a transfer learning\napproach with data augmentation which achieved a micro F1 score of 0.71. Our\nbest system ranked 12th among 30 teams that participated in the competition.\n","authors":["Dhiman Goswami","Md Nishat Raihan","Sadiya Sayara Chowdhury Puspo","Marcos Zampieri"],"pdf_url":"https://arxiv.org/pdf/2311.15032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15029v1","updated":"2023-11-25T13:47:34Z","published":"2023-11-25T13:47:34Z","title":"nlpBDpatriots at BLP-2023 Task 1: A Two-Step Classification for Violence\n Inciting Text Detection in Bangla","summary":" In this paper, we discuss the nlpBDpatriots entry to the shared task on\nViolence Inciting Text Detection (VITD) organized as part of the first workshop\non Bangla Language Processing (BLP) co-located with EMNLP. The aim of this task\nis to identify and classify the violent threats, that provoke further unlawful\nviolent acts. Our best-performing approach for the task is two-step\nclassification using back translation and multilinguality which ranked 6th out\nof 27 teams with a macro F1 score of 0.74.\n","authors":["Md Nishat Raihan","Dhiman Goswami","Sadiya Sayara Chowdhury Puspo","Marcos Zampieri"],"pdf_url":"https://arxiv.org/pdf/2311.15029v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01828v2","updated":"2023-11-25T13:35:34Z","published":"2023-10-03T06:51:48Z","title":"Trainable Noise Model as an XAI evaluation method: application on Sobol\n for remote sensing image segmentation","summary":" eXplainable Artificial Intelligence (XAI) has emerged as an essential\nrequirement when dealing with mission-critical applications, ensuring\ntransparency and interpretability of the employed black box AI models. The\nsignificance of XAI spans various domains, from healthcare to finance, where\nunderstanding the decision-making process of deep learning algorithms is\nessential. Most AI-based computer vision models are often black boxes; hence,\nproviding explainability of deep neural networks in image processing is crucial\nfor their wide adoption and deployment in medical image analysis, autonomous\ndriving, and remote sensing applications. Recently, several XAI methods for\nimage classification tasks have been introduced. On the contrary, image\nsegmentation has received comparatively less attention in the context of\nexplainability, although it is a fundamental task in computer vision\napplications, especially in remote sensing. Only some research proposes\ngradient-based XAI algorithms for image segmentation. This paper adapts the\nrecent gradient-free Sobol XAI method for semantic segmentation. 
To measure the\nperformance of the Sobol method for segmentation, we propose a quantitative XAI\nevaluation method based on a learnable noise model. The main objective of this\nmodel is to induce noise on the explanation maps, where higher induced noise\nsignifies low accuracy and vice versa. A benchmark analysis is conducted to\nevaluate and compare performance of three XAI methods, including Seg-Grad-CAM,\nSeg-Grad-CAM++ and Seg-Sobol using the proposed noise-based evaluation\ntechnique. This constitutes the first attempt to run and evaluate XAI methods\nusing high-resolution satellite images.\n","authors":["Hossein Shreim","Abdul Karim Gizzini","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01828v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15023v1","updated":"2023-11-25T13:27:22Z","published":"2023-11-25T13:27:22Z","title":"Offensive Language Identification in Transliterated and Code-Mixed\n Bangla","summary":" Identifying offensive content in social media is vital for creating safe\nonline communities. Several recent studies have addressed this problem by\ncreating datasets for various languages. In this paper, we explore offensive\nlanguage identification in texts with transliterations and code-mixing,\nlinguistic phenomena common in multilingual societies, and a known challenge\nfor NLP systems. We introduce TB-OLID, a transliterated Bangla offensive\nlanguage dataset containing 5,000 manually annotated comments. We train and\nfine-tune machine learning models on TB-OLID, and we evaluate their results on\nthis dataset. Our results show that English pre-trained transformer-based\nmodels, such as fBERT and HateBERT achieve the best performance on this\ndataset.\n","authors":["Md Nishat Raihan","Umma Hani Tanmoy","Anika Binte Islam","Kai North","Tharindu Ranasinghe","Antonios Anastasopoulos","Marcos Zampieri"],"pdf_url":"https://arxiv.org/pdf/2311.15023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18387v2","updated":"2023-11-25T13:13:01Z","published":"2023-10-27T09:59:35Z","title":"OffMix-3L: A Novel Code-Mixed Dataset in Bangla-English-Hindi for\n Offensive Language Identification","summary":" Code-mixing is a well-studied linguistic phenomenon when two or more\nlanguages are mixed in text or speech. Several works have been conducted on\nbuilding datasets and performing downstream NLP tasks on code-mixed data.\nAlthough it is not uncommon to observe code-mixing of three or more languages,\nmost available datasets in this domain contain code-mixed data from only two\nlanguages. In this paper, we introduce OffMix-3L, a novel offensive language\nidentification dataset containing code-mixed data from three different\nlanguages. 
We experiment with several models on this dataset and observe that\nBanglishBERT outperforms other transformer-based models and GPT-3.5.\n","authors":["Dhiman Goswami","Md Nishat Raihan","Antara Mahmud","Antonios Anastasopoulos","Marcos Zampieri"],"pdf_url":"https://arxiv.org/pdf/2310.18387v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2310.18023"},{"id":"http://arxiv.org/abs/2311.15016v1","updated":"2023-11-25T12:47:39Z","published":"2023-11-25T12:47:39Z","title":"E-CORE: Emotion Correlation Enhanced Empathetic Dialogue Generation","summary":" Achieving empathy is a crucial step toward humanized dialogue systems.\nCurrent approaches for empathetic dialogue generation mainly perceive an\nemotional label to generate an empathetic response conditioned on it, which\nsimply treat emotions independently, but ignore the intrinsic emotion\ncorrelation in dialogues, resulting in inaccurate emotion perception and\nunsuitable response generation. In this paper, we propose a novel emotion\ncorrelation enhanced empathetic dialogue generation framework, which\ncomprehensively realizes emotion correlation learning, utilization, and\nsupervising. Specifically, a multi-resolution emotion graph is devised to\ncapture context-based emotion interactions from different resolutions, further\nmodeling emotion correlation. Then we propose an emotion correlation enhanced\ndecoder, with a novel correlation-aware aggregation and soft/hard strategy,\nrespectively improving the emotion perception and response generation.\nExperimental results on the benchmark dataset demonstrate the superiority of\nour model in both empathetic perception and expression.\n","authors":["Fengyi Fu","Lei Zhang","Quan Wang","Zhendong Mao"],"pdf_url":"https://arxiv.org/pdf/2311.15016v1.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.00802v3","updated":"2023-11-25T09:58:18Z","published":"2023-08-01T19:34:18Z","title":"GRDD: A Dataset for Greek Dialectal NLP","summary":" In this paper, we present a dataset for the computational study of a number\nof Modern Greek dialects. It consists of raw text data from four dialects of\nModern Greek, Cretan, Pontic, Northern Greek and Cypriot Greek. The dataset is\nof considerable size, albeit imbalanced, and presents the first attempt to\ncreate large scale dialectal resources of this type for Modern Greek dialects.\nWe then use the dataset to perform dialect identification. We experiment with\ntraditional ML algorithms, as well as simple DL architectures. The results show\nvery good performance on the task, potentially revealing that the dialects in\nquestion have distinct enough characteristics allowing even simple ML models to\nperform well on the task. Error analysis is performed for the top performing\nalgorithms showing that in a number of cases the errors are due to insufficient\ndataset cleaning.\n","authors":["Stergios Chatzikyriakidis","Chatrine Qwaider","Ilias Kolokousis","Christina Koula","Dimitris Papadakis","Efthymia Sakellariou"],"pdf_url":"https://arxiv.org/pdf/2308.00802v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14966v1","updated":"2023-11-25T08:58:07Z","published":"2023-11-25T08:58:07Z","title":"Walking a Tightrope -- Evaluating Large Language Models in High-Risk\n Domains","summary":" High-risk domains pose unique challenges that require language models to\nprovide accurate and safe responses. 
Despite the great success of large\nlanguage models (LLMs), such as ChatGPT and its variants, their performance in\nhigh-risk domains remains unclear. Our study delves into an in-depth analysis\nof the performance of instruction-tuned LLMs, focusing on factual accuracy and\nsafety adherence. To comprehensively assess the capabilities of LLMs, we\nconduct experiments on six NLP datasets including question answering and\nsummarization tasks within two high-risk domains: legal and medical. Further\nqualitative analysis highlights the existing limitations inherent in current\nLLMs when evaluating in high-risk domains. This underscores the essential\nnature of not only improving LLM capabilities but also prioritizing the\nrefinement of domain-specific metrics, and embracing a more human-centric\napproach to enhance safety and factual reliability. Our findings advance the\nfield toward the concerns of properly evaluating LLMs in high-risk domains,\naiming to steer the adaptability of LLMs in fulfilling societal obligations and\naligning with forthcoming regulations, such as the EU AI Act.\n","authors":["Chia-Chien Hung","Wiem Ben Rim","Lindsay Frost","Lars Bruckner","Carolin Lawrence"],"pdf_url":"https://arxiv.org/pdf/2311.14966v1.pdf","comment":"EMNLP 2023 Workshop on Benchmarking Generalisation in NLP (GenBench)"},{"id":"http://arxiv.org/abs/2311.14949v1","updated":"2023-11-25T07:13:06Z","published":"2023-11-25T07:13:06Z","title":"Vector-Quantized Prompt Learning for Paraphrase Generation","summary":" Deep generative modeling of natural languages has achieved many successes,\nsuch as producing fluent sentences and translating from one language into\nanother. However, the development of generative modeling techniques for\nparaphrase generation still lags behind largely due to the challenges in\naddressing the complex conflicts between expression diversity and semantic\npreservation. This paper proposes to generate diverse and high-quality\nparaphrases by exploiting the pre-trained models with instance-dependent\nprompts. To learn generalizable prompts, we assume that the number of abstract\ntransforming patterns of paraphrase generation (governed by prompts) is finite\nand usually not large. Therefore, we present vector-quantized prompts as the\ncues to control the generation of pre-trained models. Extensive experiments\ndemonstrate that the proposed method achieves new state-of-art results on three\nbenchmark datasets, including Quora, Wikianswers, and MSCOCO. We will release\nall the code upon acceptance.\n","authors":["Haotian Luo","Yixin Liu","Peidong Liu","Xianggen Liu"],"pdf_url":"https://arxiv.org/pdf/2311.14949v1.pdf","comment":"EMNLP Findings, 2023"},{"id":"http://arxiv.org/abs/2311.14919v1","updated":"2023-11-25T03:38:14Z","published":"2023-11-25T03:38:14Z","title":"Faster Minimum Bayes Risk Decoding with Confidence-based Pruning","summary":" Minimum Bayes risk (MBR) decoding outputs the hypothesis with the highest\nexpected utility over the model distribution for some utility function. It has\nbeen shown to improve accuracy over beam search in conditional language\ngeneration problems and especially neural machine translation, in both human\nand automatic evaluations. However, the standard sampling-based algorithm for\nMBR is substantially more computationally expensive than beam search, requiring\na large number of samples as well as a quadratic number of calls to the utility\nfunction, limiting its applicability. 
We describe an algorithm for MBR which\ngradually grows the number of samples used to estimate the utility while\npruning hypotheses that are unlikely to have the highest utility according to\nconfidence estimates obtained with bootstrap sampling. Our method requires\nfewer samples and drastically reduces the number of calls to the utility\nfunction compared to standard MBR while being statistically indistinguishable\nin terms of accuracy. We demonstrate the effectiveness of our approach in\nexperiments on three language pairs, using chrF++ and COMET as\nutility/evaluation metrics.\n","authors":["Julius Cheng","Andreas Vlachos"],"pdf_url":"https://arxiv.org/pdf/2311.14919v1.pdf","comment":"Updated from EMNLP 2023 version: typo fix, minor math notation\n change, updated citation"},{"id":"http://arxiv.org/abs/2311.14901v1","updated":"2023-11-25T02:31:22Z","published":"2023-11-25T02:31:22Z","title":"Code Search Debiasing:Improve Search Results beyond Overall Ranking\n Performance","summary":" Code search engine is an essential tool in software development. Many code\nsearch methods have sprung up, focusing on the overall ranking performance of\ncode search. In this paper, we study code search from another perspective by\nanalyzing the bias of code search models. Biased code search engines provide\npoor user experience, even though they show promising overall performance. Due\nto different development conventions (e.g., prefer long queries or\nabbreviations), some programmers will find the engine useful, while others may\nfind it hard to get desirable search results. To mitigate biases, we develop a\ngeneral debiasing framework that employs reranking to calibrate search results.\nIt can be easily plugged into existing engines and handle new code search\nbiases discovered in the future. Experiments show that our framework can\neffectively reduce biases. Meanwhile, the overall ranking performance of code\nsearch gets improved after debiasing.\n","authors":["Sheng Zhang","Hui Li","Yanlin Wang","Zhao Wei","Yong Xiu","Juhong Wang","Rongong Ji"],"pdf_url":"https://arxiv.org/pdf/2311.14901v1.pdf","comment":"Accepted to Findings of EMNLP 2023. 11 pages"},{"id":"http://arxiv.org/abs/2310.10520v3","updated":"2023-11-25T02:09:35Z","published":"2023-10-16T15:38:02Z","title":"Semantic Parsing by Large Language Models for Intricate Updating\n Strategies of Zero-Shot Dialogue State Tracking","summary":" Zero-shot Dialogue State Tracking (DST) addresses the challenge of acquiring\nand annotating task-oriented dialogues, which can be time-consuming and costly.\nHowever, DST extends beyond simple slot-filling and requires effective updating\nstrategies for tracking dialogue state as conversations progress. In this\npaper, we propose ParsingDST, a new In-Context Learning (ICL) method, to\nintroduce additional intricate updating strategies in zero-shot DST. Our\napproach reformulates the DST task by leveraging powerful Large Language Models\n(LLMs) and translating the original dialogue text to JSON through semantic\nparsing as an intermediate state. We also design a novel framework that\nincludes more modules to ensure the effectiveness of updating strategies in the\ntext-to-JSON process. Experimental results demonstrate that our approach\noutperforms existing zero-shot DST methods on MultiWOZ, exhibiting significant\nimprovements in Joint Goal Accuracy (JGA) and slot accuracy compared to\nexisting ICL methods. 
Our code has been released.\n","authors":["Yuxiang Wu","Guanting Dong","Weiran Xu"],"pdf_url":"https://arxiv.org/pdf/2310.10520v3.pdf","comment":"Accepted to the Findings of EMNLP 2023 (Short Paper)"},{"id":"http://arxiv.org/abs/2308.10819v3","updated":"2023-11-25T00:25:36Z","published":"2023-08-17T06:21:50Z","title":"Evaluating the Instruction-Following Robustness of Large Language Models\n to Prompt Injection","summary":" Large Language Models (LLMs) have demonstrated exceptional proficiency in\ninstruction-following, becoming increasingly crucial across various\napplications. However, this capability brings with it the risk of prompt\ninjection attacks, where attackers inject instructions into LLMs' input to\nelicit undesirable actions or content. Understanding the robustness of LLMs\nagainst such attacks is vital for their safe implementation. In this work, we\nestablish a benchmark to evaluate the robustness of instruction-following LLMs\nagainst prompt injection attacks. Our objective is to determine the extent to\nwhich LLMs can be influenced by injected instructions and their ability to\ndifferentiate between these injected and original target instructions. Through\nextensive experiments with leading instruction-following LLMs, we uncover\nsignificant vulnerabilities in their robustness to such attacks. Our results\nindicate that some models are overly tuned to follow any embedded instructions\nin the prompt, overly focusing on the latter parts of the prompt without fully\ngrasping the entire context. By contrast, models with a better grasp of the\ncontext and instruction-following capabilities will potentially be more\nsusceptible to compromise by injected instructions. This underscores the need\nto shift the focus from merely enhancing LLMs' instruction-following\ncapabilities to improving their overall comprehension of prompts and\ndiscernment of instructions that are appropriate to follow. We hope our\nin-depth analysis offers insights into the underlying causes of these\nvulnerabilities, aiding in the development of future solutions. Code and data\nare available at\nhttps://github.com/Leezekun/instruction-following-robustness-eval\n","authors":["Zekun Li","Baolin Peng","Pengcheng He","Xifeng Yan"],"pdf_url":"https://arxiv.org/pdf/2308.10819v3.pdf","comment":"The data and code can be found at\n https://github.com/Leezekun/instruction-following-robustness-eval"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2305.07637v3","updated":"2023-11-25T17:19:36Z","published":"2023-05-12T17:46:06Z","title":"Text2Cohort: Facilitating Intuitive Access to Biomedical Data with\n Natural Language Cohort Discovery","summary":" The Imaging Data Commons (IDC) is a cloud-based database that provides\nresearchers with open access to cancer imaging data, with the goal of\nfacilitating collaboration. However, cohort discovery within the IDC database\nhas a significant technical learning curve. Recently, large language models\n(LLM) have demonstrated exceptional utility for natural language processing\ntasks. We developed Text2Cohort, a LLM-powered toolkit to facilitate\nuser-friendly natural language cohort discovery in the IDC. Our method\ntranslates user input into IDC queries using grounding techniques and returns\nthe query's response. We evaluate Text2Cohort on 50 natural language inputs,\nfrom information extraction to cohort discovery. Our toolkit successfully\ngenerated responses with an 88% accuracy and 0.94 F1 score. 
We demonstrate that\nText2Cohort can enable researchers to discover and curate cohorts on IDC with\nhigh levels of accuracy using natural language in a more intuitive and\nuser-friendly way.\n","authors":["Pranav Kulkarni","Adway Kanhere","Paul H. Yi","Vishwa S. Parekh"],"pdf_url":"https://arxiv.org/pdf/2305.07637v3.pdf","comment":"5 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2310.20189v2","updated":"2023-11-25T11:46:49Z","published":"2023-10-31T05:16:54Z","title":"LFG: A Generative Network for Real-Time Recommendation","summary":" Recommender systems are essential information technologies today, and\nrecommendation algorithms combined with deep learning have become a research\nhotspot in this field. The recommendation model known as LFM (Latent Factor\nModel), which captures latent features through matrix factorization and\ngradient descent to fit user preferences, has given rise to various\nrecommendation algorithms that bring new improvements in recommendation\naccuracy. However, collaborative filtering recommendation models based on LFM\nlack flexibility and has shortcomings for real-time recommendations, as they\nneed to redo the matrix factorization and retrain using gradient descent when\nnew users arrive. In response to this, this paper innovatively proposes a\nLatent Factor Generator (LFG) network, and set the movie recommendation as\nresearch theme. The LFG dynamically generates user latent factors through deep\nneural networks without the need for re-factorization or retrain. Experimental\nresults indicate that the LFG recommendation model outperforms traditional\nmatrix factorization algorithms in recommendation accuracy, providing an\neffective solution to the challenges of real-time recommendations with LFM.\n","authors":["Junyi Liu"],"pdf_url":"https://arxiv.org/pdf/2310.20189v2.pdf","comment":"9 pages, 1 figure, 4 tables. Source code would be uploaded to github\n soon"},{"id":"http://arxiv.org/abs/2311.14968v1","updated":"2023-11-25T08:59:45Z","published":"2023-11-25T08:59:45Z","title":"Hide Your Model: A Parameter Transmission-free Federated Recommender\n System","summary":" With the growing concerns regarding user data privacy, Federated Recommender\nSystem (FedRec) has garnered significant attention recently due to its\nprivacy-preserving capabilities. Existing FedRecs generally adhere to a\nlearning protocol in which a central server shares a global recommendation\nmodel with clients, and participants achieve collaborative learning by\nfrequently communicating the model's public parameters. Nevertheless, this\nlearning framework has two drawbacks that limit its practical usability: (1) It\nnecessitates a global-sharing recommendation model; however, in real-world\nscenarios, information related to the recommender model, including its\nalgorithm and parameters, constitutes the platforms' intellectual property.\nHence, service providers are unlikely to release such information actively. (2)\nThe communication costs of model parameter transmission are expensive since the\nmodel parameters are usually high-dimensional matrices. With the model size\nincreasing, the communication burden will be the bottleneck for such\ntraditional FedRecs.\n Given the above limitations, this paper introduces a novel parameter\ntransmission-free federated recommendation framework that balances the\nprotection between users' data privacy and platforms' model privacy, namely\nPTF-FedRec. 
Specifically, participants in PTF-FedRec collaboratively exchange\nknowledge by sharing their predictions within a privacy-preserving mechanism.\nThrough this way, the central server can learn a recommender model without\ndisclosing its model parameters or accessing clients' raw data, preserving both\nthe server's model privacy and users' data privacy. Besides, since clients and\nthe central server only need to communicate prediction scores which are just a\nfew real numbers, the overhead is significantly reduced compared to traditional\nFedRecs.\n","authors":["Wei Yuan","Chaoqun Yang","Liang Qu","Quoc Viet Hung Nguyen","Jianxin Li","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2311.14968v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14318v3","updated":"2023-11-25T06:02:47Z","published":"2023-10-22T14:41:10Z","title":"Intent Contrastive Learning with Cross Subsequences for Sequential\n Recommendation","summary":" The user purchase behaviors are mainly influenced by their intentions (e.g.,\nbuying clothes for decoration, buying brushes for painting, etc.). Modeling a\nuser's latent intention can significantly improve the performance of\nrecommendations. Previous works model users' intentions by considering the\npredefined label in auxiliary information or introducing stochastic data\naugmentation to learn purposes in the latent space. However, the auxiliary\ninformation is sparse and not always available for recommender systems, and\nintroducing stochastic data augmentation may introduce noise and thus change\nthe intentions hidden in the sequence. Therefore, leveraging user intentions\nfor sequential recommendation (SR) can be challenging because they are\nfrequently varied and unobserved. In this paper, Intent contrastive learning\nwith Cross Subsequences for sequential Recommendation (ICSRec) is proposed to\nmodel users' latent intentions. Specifically, ICSRec first segments a user's\nsequential behaviors into multiple subsequences by using a dynamic sliding\noperation and takes these subsequences into the encoder to generate the\nrepresentations for the user's intentions. To tackle the problem of no explicit\nlabels for purposes, ICSRec assumes different subsequences with the same target\nitem may represent the same intention and proposes a coarse-grain intent\ncontrastive learning to push these subsequences closer. Then, fine-grain intent\ncontrastive learning is mentioned to capture the fine-grain intentions of\nsubsequences in sequential behaviors. Extensive experiments conducted on four\nreal-world datasets demonstrate the superior performance of the proposed ICSRec\nmodel compared with baseline methods.\n","authors":["Xiuyuan Qin","Huanhuan Yuan","Pengpeng Zhao","Guanfeng Liu","Fuzhen Zhuang","Victor S. Sheng"],"pdf_url":"https://arxiv.org/pdf/2310.14318v3.pdf","comment":"10pages, 5figures, WSDM2024. arXiv admin note: text overlap with\n arXiv:2304.07763"},{"id":"http://arxiv.org/abs/2311.16515v1","updated":"2023-11-25T14:24:49Z","published":"2023-11-25T14:24:49Z","title":"Word for Person: Zero-shot Composed Person Retrieval","summary":" Searching for specific person has great security value and social benefits,\nand it often involves a combination of visual and textual information.\nConventional person retrieval methods, whether image-based or text-based,\nusually fall short in effectively harnessing both types of information, leading\nto the loss of accuracy. 
In this paper, a whole new task called Composed Person\nRetrieval (CPR) is proposed to jointly utilize both image and text information\nfor target person retrieval. However, the supervised CPR must depend on very\ncostly manual annotation dataset, while there are currently no available\nresources. To mitigate this issue, we firstly introduce the Zero-shot Composed\nPerson Retrieval (ZS-CPR), which leverages existing domain-related data to\nresolve the CPR problem without reliance on expensive annotations. Secondly, to\nlearn ZS-CPR model, we propose a two-stage learning framework, Word4Per, where\na lightweight Textual Inversion Network (TINet) and a text-based person\nretrieval model based on fine-tuned Contrastive Language-Image Pre-training\n(CLIP) network are learned without utilizing any CPR data. Thirdly, a finely\nannotated Image-Text Composed Person Retrieval dataset (ITCPR) is built as the\nbenchmark to assess the performance of the proposed Word4Per framework.\nExtensive experiments under both Rank-1 and mAP demonstrate the effectiveness\nof Word4Per for the ZS-CPR task, surpassing the comparative methods by over\n10%. The code and ITCPR dataset will be publicly available at\nhttps://github.com/Delong-liu-bupt/Word4Per.\n","authors":["Delong Liu","Haiwen Li","Zhicheng Zhao","Fei Su","Hongying Meng"],"pdf_url":"https://arxiv.org/pdf/2311.16515v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2311.15080v1","updated":"2023-11-25T17:18:35Z","published":"2023-11-25T17:18:35Z","title":"Weakly-Supervised Audio-Visual Segmentation","summary":" Audio-visual segmentation is a challenging task that aims to predict\npixel-level masks for sound sources in a video. Previous work applied a\ncomprehensive manually designed architecture with countless pixel-wise accurate\nmasks as supervision. However, these pixel-level masks are expensive and not\navailable in all cases. In this work, we aim to simplify the supervision as the\ninstance-level annotation, i.e., weakly-supervised audio-visual segmentation.\nWe present a novel Weakly-Supervised Audio-Visual Segmentation framework,\nnamely WS-AVS, that can learn multi-scale audio-visual alignment with\nmulti-scale multiple-instance contrastive learning for audio-visual\nsegmentation. Extensive experiments on AVSBench demonstrate the effectiveness\nof our WS-AVS in the weakly-supervised audio-visual segmentation of\nsingle-source and multi-source scenarios.\n","authors":["Shentong Mo","Bhiksha Raj"],"pdf_url":"https://arxiv.org/pdf/2311.15080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14977v1","updated":"2023-11-25T09:38:24Z","published":"2023-11-25T09:38:24Z","title":"Incorporating granularity bias as the margin into contrastive loss for\n video captioning","summary":" Video captioning models easily suffer from long-tail distribution of phrases,\nwhich makes captioning models prone to generate vague sentences instead of\naccurate ones. However, existing debiasing strategies tend to export external\nknowledge to build dependency trees of words or refine frequency distribution\nby complex losses and extra input features, which lack interpretability and are\nhard to train. To mitigate the impact of granularity bias on the model, we\nintroduced a statistical-based bias extractor. 
This extractor quantifies the\ninformation content within sentences and videos, providing an estimate of the\nlikelihood that a video-sentence pair is affected by granularity bias.\nFurthermore, with the growing trend of integrating contrastive learning methods\ninto video captioning tasks, we use a bidirectional triplet loss to get more\nnegative samples in a batch. Subsequently, we incorporate the margin score into\nthe contrastive learning loss, establishing distinct training objectives for\nhead and tail sentences. This approach facilitates the model's training\neffectiveness on tail samples. Our simple yet effective loss, incorporating\nGranularity bias, is referred to as the Margin-Contrastive Loss (GMC Loss). The\nproposed model demonstrates state-of-the-art performance on MSRVTT with a CIDEr\nof 57.17, and MSVD, where CIDEr reaches up to 138.68.\n","authors":["Jiayang Gu","Fengming Yao"],"pdf_url":"https://arxiv.org/pdf/2311.14977v1.pdf","comment":"6 pages, 2 figures"},{"id":"http://arxiv.org/abs/2311.08172v2","updated":"2023-11-25T07:59:48Z","published":"2023-11-14T14:02:32Z","title":"Vision-Language Instruction Tuning: A Review and Analysis","summary":" Instruction tuning is a crucial supervised training phase in Large Language\nModels (LLMs), aiming to enhance the LLM's ability to generalize instruction\nexecution and adapt to user preferences. With the increasing integration of\nmulti-modal data into LLMs, there is growing interest in Vision-Language\nInstruction Tuning (VLIT), which presents more complex characteristics compared\nto pure text instruction tuning. In this paper, we systematically review the\nlatest VLIT settings and corresponding datasets in multi-modal LLMs and provide\ninsights into the intrinsic motivations behind their design. For the first\ntime, we offer a detailed multi-perspective categorization for existing VLIT\ndatasets and identify the characteristics that high-quality VLIT data should\npossess. By incorporating these characteristics as guiding principles into the\nexisting VLIT data construction process, we conduct extensive experiments and\nverify their positive impact on the performance of tuned multi-modal LLMs.\nFurthermore, we discuss the current challenges and future research directions\nof VLIT, providing insights for the continuous development of this field. The\ncode and dataset related to this paper have been open-sourced at\nhttps://github.com/palchenli/VL-Instruction-Tuning.\n","authors":["Chen Li","Yixiao Ge","Dian Li","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2311.08172v2.pdf","comment":"34 pages, 6 figures"}]},"2023-11-28T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2311.17049v1","updated":"2023-11-28T18:55:42Z","published":"2023-11-28T18:55:42Z","title":"MobileCLIP: Fast Image-Text Models through Multi-Modal Reinforced\n Training","summary":" Contrastive pretraining of image-text foundation models, such as CLIP,\ndemonstrated excellent zero-shot performance and improved robustness on a wide\nrange of downstream tasks. However, these models utilize large\ntransformer-based encoders with significant memory and latency overhead which\npose challenges for deployment on mobile devices. In this work, we introduce\nMobileCLIP -- a new family of efficient image-text models optimized for runtime\nperformance along with a novel and efficient training approach, namely\nmulti-modal reinforced training. 
The proposed training approach leverages\nknowledge transfer from an image captioning model and an ensemble of strong\nCLIP encoders to improve the accuracy of efficient models. Our approach avoids\ntrain-time compute overhead by storing the additional knowledge in a reinforced\ndataset. MobileCLIP sets a new state-of-the-art latency-accuracy tradeoff for\nzero-shot classification and retrieval tasks on several datasets. Our\nMobileCLIP-S2 variant is 2.3$\\times$ faster while more accurate compared to\nprevious best CLIP model based on ViT-B/16. We further demonstrate the\neffectiveness of our multi-modal reinforced training by training a CLIP model\nbased on ViT-B/16 image backbone and achieving +2.9% average performance\nimprovement on 38 evaluation benchmarks compared to the previous best.\nMoreover, we show that the proposed approach achieves 10$\\times$-1000$\\times$\nimproved learning efficiency when compared with non-reinforced CLIP training.\n","authors":["Pavan Kumar Anasosalu Vasu","Hadi Pouransari","Fartash Faghri","Raviteja Vemulapalli","Oncel Tuzel"],"pdf_url":"https://arxiv.org/pdf/2311.17049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17043v1","updated":"2023-11-28T18:53:43Z","published":"2023-11-28T18:53:43Z","title":"LLaMA-VID: An Image is Worth 2 Tokens in Large Language Models","summary":" In this work, we present a novel method to tackle the token generation\nchallenge in Vision Language Models (VLMs) for video and image understanding,\ncalled LLaMA-VID. Current VLMs, while proficient in tasks like image captioning\nand visual question answering, face computational burdens when processing long\nvideos due to the excessive visual tokens. LLaMA-VID addresses this issue by\nrepresenting each frame with two distinct tokens, namely context token and\ncontent token. The context token encodes the overall image context based on\nuser input, whereas the content token encapsulates visual cues in each frame.\nThis dual-token strategy significantly reduces the overload of long videos\nwhile preserving critical information. Generally, LLaMA-VID empowers existing\nframeworks to support hour-long videos and pushes their upper limit with an\nextra context token. It is proved to surpass previous methods on most of video-\nor image-based benchmarks. Code is available\nhttps://github.com/dvlab-research/LLaMA-VID}{https://github.com/dvlab-research/LLaMA-VID\n","authors":["Yanwei Li","Chengyao Wang","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2311.17043v1.pdf","comment":"Code is available at https://github.com/dvlab-research/LLaMA-VID"},{"id":"http://arxiv.org/abs/2311.17041v1","updated":"2023-11-28T18:53:06Z","published":"2023-11-28T18:53:06Z","title":"Efficient In-Context Learning in Vision-Language Models for Egocentric\n Videos","summary":" Recent advancements in text-only large language models (LLMs) have\nhighlighted the benefit of in-context learning for adapting to new tasks with a\nfew demonstrations. However, extending in-context learning to large\nvision-language models (VLMs) using a huge amount of naturalistic\nvision-language data has shown limited success, particularly for egocentric\nvideos, due to high data collection costs. We propose a novel training method\n$\\mathbb{E}$fficient $\\mathbb{I}$n-context $\\mathbb{L}$earning on\n$\\mathbb{E}$gocentric $\\mathbb{V}$ideos ($\\mathbb{EILEV}$), which elicits\nin-context learning in VLMs for egocentric videos without requiring massive,\nnaturalistic egocentric video datasets. 
$\\mathbb{EILEV}$ involves architectural\nand training data adaptations to allow the model to process contexts\ninterleaved with video clips and narrations, sampling of in-context examples\nwith clusters of similar verbs and nouns, use of data with skewed marginal\ndistributions with a long tail of infrequent verbs and nouns, as well as\nhomonyms and synonyms. Our evaluations show that $\\mathbb{EILEV}$-trained\nmodels outperform larger VLMs trained on a huge amount of naturalistic data in\nin-context learning. Furthermore, they can generalize to not only\nout-of-distribution, but also novel, rare egocentric videos and texts via\nin-context learning, demonstrating potential for applications requiring\ncost-effective training, and rapid post-deployment adaptability. Our code and\ndemo are available at \\url{https://github.com/yukw777/EILEV}.\n","authors":["Keunwoo Peter Yu","Zheyuan Zhang","Fengyuan Hu","Joyce Chai"],"pdf_url":"https://arxiv.org/pdf/2311.17041v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17035v1","updated":"2023-11-28T18:47:03Z","published":"2023-11-28T18:47:03Z","title":"Scalable Extraction of Training Data from (Production) Language Models","summary":" This paper studies extractable memorization: training data that an adversary\ncan efficiently extract by querying a machine learning model without prior\nknowledge of the training dataset. We show an adversary can extract gigabytes\nof training data from open-source language models like Pythia or GPT-Neo,\nsemi-open models like LLaMA or Falcon, and closed models like ChatGPT. Existing\ntechniques from the literature suffice to attack unaligned models; in order to\nattack the aligned ChatGPT, we develop a new divergence attack that causes the\nmodel to diverge from its chatbot-style generations and emit training data at a\nrate 150x higher than when behaving properly. Our methods show practical\nattacks can recover far more data than previously thought, and reveal that\ncurrent alignment techniques do not eliminate memorization.\n","authors":["Milad Nasr","Nicholas Carlini","Jonathan Hayase","Matthew Jagielski","A. Feder Cooper","Daphne Ippolito","Christopher A. Choquette-Choo","Eric Wallace","Florian Tramèr","Katherine Lee"],"pdf_url":"https://arxiv.org/pdf/2311.17035v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17030v1","updated":"2023-11-28T18:32:19Z","published":"2023-11-28T18:32:19Z","title":"Is This the Subspace You Are Looking for? An Interpretability Illusion\n for Subspace Activation Patching","summary":" Mechanistic interpretability aims to understand model behaviors in terms of\nspecific, interpretable features, often hypothesized to manifest as\nlow-dimensional subspaces of activations. Specifically, recent studies have\nexplored subspace interventions (such as activation patching) as a way to\nsimultaneously manipulate model behavior and attribute the features behind it\nto given subspaces.\n In this work, we demonstrate that these two aims diverge, potentially leading\nto an illusory sense of interpretability. Counterintuitively, even if a\nsubspace intervention makes the model's output behave as if the value of a\nfeature was changed, this effect may be achieved by activating a dormant\nparallel pathway leveraging another subspace that is causally disconnected from\nmodel outputs. We demonstrate this phenomenon in a distilled mathematical\nexample, in two real-world domains (the indirect object identification task and\nfactual recall), and present evidence for its prevalence in practice. 
In the\ncontext of factual recall, we further show a link to rank-1 fact editing,\nproviding a mechanistic explanation for previous work observing an\ninconsistency between fact editing performance and fact localization.\n However, this does not imply that activation patching of subspaces is\nintrinsically unfit for interpretability. To contextualize our findings, we\nalso show what a success case looks like in a task (indirect object\nidentification) where prior manual circuit analysis informs an understanding of\nthe location of a feature. We explore the additional evidence needed to argue\nthat a patched subspace is faithful.\n","authors":["Aleksandar Makelov","Georg Lange","Neel Nanda"],"pdf_url":"https://arxiv.org/pdf/2311.17030v1.pdf","comment":"NeurIPS 2023 Workshop on Attributing Model Behavior at Scale"},{"id":"http://arxiv.org/abs/2310.04438v2","updated":"2023-11-28T18:27:54Z","published":"2023-09-30T22:27:37Z","title":"A Brief History of Prompt: Leveraging Language Models. (Through Advanced\n Prompting)","summary":" This paper presents a comprehensive exploration of the evolution of prompt\nengineering and generation in the field of natural language processing (NLP).\nStarting from the early language models and information retrieval systems, we\ntrace the key developments that have shaped prompt engineering over the years.\nThe introduction of attention mechanisms in 2015 revolutionized language\nunderstanding, leading to advancements in controllability and\ncontext-awareness. Subsequent breakthroughs in reinforcement learning\ntechniques further enhanced prompt engineering, addressing issues like exposure\nbias and biases in generated text. We examine the significant contributions in\n2018 and 2019, focusing on fine-tuning strategies, control codes, and\ntemplate-based generation. The paper also discusses the growing importance of\nfairness, human-AI collaboration, and low-resource adaptation. In 2020 and\n2021, contextual prompting and transfer learning gained prominence, while 2022\nand 2023 witnessed the emergence of advanced techniques like unsupervised\npre-training and novel reward shaping. Throughout the paper, we reference\nspecific research studies that exemplify the impact of various developments on\nprompt engineering. The journey of prompt engineering continues, with ethical\nconsiderations being paramount for the responsible and inclusive future of AI\nsystems.\n","authors":["Golam Md Muktadir"],"pdf_url":"https://arxiv.org/pdf/2310.04438v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01270v2","updated":"2023-11-28T18:23:48Z","published":"2023-11-02T14:31:25Z","title":"People Make Better Edits: Measuring the Efficacy of LLM-Generated\n Counterfactually Augmented Data for Harmful Language Detection","summary":" NLP models are used in a variety of critical social computing tasks, such as\ndetecting sexist, racist, or otherwise hateful content. Therefore, it is\nimperative that these models are robust to spurious features. Past work has\nattempted to tackle such spurious features using training data augmentation,\nincluding Counterfactually Augmented Data (CADs). CADs introduce minimal\nchanges to existing training data points and flip their labels; training on\nthem may reduce model dependency on spurious features. However, manually\ngenerating CADs can be time-consuming and expensive. Hence in this work, we\nassess if this task can be automated using generative NLP models. 
We\nautomatically generate CADs using Polyjuice, ChatGPT, and Flan-T5, and evaluate\ntheir usefulness in improving model robustness compared to manually-generated\nCADs. By testing both model performance on multiple out-of-domain test sets and\nindividual data point efficacy, our results show that while manual CADs are\nstill the most effective, CADs generated by ChatGPT come a close second. One\nkey reason for the lower performance of automated methods is that the changes\nthey introduce are often insufficient to flip the original label.\n","authors":["Indira Sen","Dennis Assenmacher","Mattia Samory","Isabelle Augenstein","Wil van der Aalst","Claudia Wagner"],"pdf_url":"https://arxiv.org/pdf/2311.01270v2.pdf","comment":"Preprint of EMNLP'23 paper"},{"id":"http://arxiv.org/abs/2311.16989v1","updated":"2023-11-28T17:44:51Z","published":"2023-11-28T17:44:51Z","title":"ChatGPT's One-year Anniversary: Are Open-Source Large Language Models\n Catching up?","summary":" Upon its release in late 2022, ChatGPT has brought a seismic shift in the\nentire landscape of AI, both in research and commerce. Through\ninstruction-tuning a large language model (LLM) with supervised fine-tuning and\nreinforcement learning from human feedback, it showed that a model could answer\nhuman questions and follow instructions on a broad panel of tasks. Following\nthis success, interests in LLMs have intensified, with new LLMs flourishing at\nfrequent interval across academia and industry, including many start-ups\nfocused on LLMs. While closed-source LLMs (e.g., OpenAI's GPT, Anthropic's\nClaude) generally outperform their open-source counterparts, the progress on\nthe latter has been rapid with claims of achieving parity or even better on\ncertain tasks. This has crucial implications not only on research but also on\nbusiness. In this work, on the first anniversary of ChatGPT, we provide an\nexhaustive overview of this success, surveying all tasks where an open-source\nLLM has claimed to be on par or better than ChatGPT.\n","authors":["Hailin Chen","Fangkai Jiao","Xingxuan Li","Chengwei Qin","Mathieu Ravaut","Ruochen Zhao","Caiming Xiong","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2311.16989v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16978v1","updated":"2023-11-28T17:25:34Z","published":"2023-11-28T17:25:34Z","title":"Assessing the influence of attractor-verb distance on grammatical\n agreement in humans and language models","summary":" Subject-verb agreement in the presence of an attractor noun located between\nthe main noun and the verb elicits complex behavior: judgments of\ngrammaticality are modulated by the grammatical features of the attractor. For\nexample, in the sentence \"The girl near the boys likes climbing\", the attractor\n(boys) disagrees in grammatical number with the verb (likes), creating a\nlocally implausible transition probability. Here, we parametrically modulate\nthe distance between the attractor and the verb while keeping the length of the\nsentence equal. We evaluate the performance of both humans and two artificial\nneural network models: both make more mistakes when the attractor is closer to\nthe verb, but neural networks get close to the chance level while humans are\nmostly able to overcome the attractor interference. Additionally, we report a\nlinear effect of attractor distance on reaction times. We hypothesize that a\npossible reason for the proximity effect is the calculation of transition\nprobabilities between adjacent words. 
Nevertheless, classical models of\nattraction such as the cue-based model might suffice to explain this\nphenomenon, thus paving the way for new research. Data and analyses available\nat https://osf.io/d4g6k\n","authors":["Christos-Nikolaos Zacharopoulos","Théo Desbordes","Mathias Sablé-Meyer"],"pdf_url":"https://arxiv.org/pdf/2311.16978v1.pdf","comment":"10 pages (5 main, 2 refs, 3 supplementary) ; 5 figures (3 main, 2\n supplementary) ; accepted at EMNLP 2023 (no DOI yet)"},{"id":"http://arxiv.org/abs/2311.16965v1","updated":"2023-11-28T17:12:06Z","published":"2023-11-28T17:12:06Z","title":"Natural Language Processing Through Transfer Learning: A Case Study on\n Sentiment Analysis","summary":" Artificial intelligence and machine learning have significantly bolstered the\ntechnological world. This paper explores the potential of transfer learning in\nnatural language processing focusing mainly on sentiment analysis. The models\ntrained on the big data can also be used where data are scarce. The claim is\nthat, compared to training models from scratch, transfer learning, using\npre-trained BERT models, can increase sentiment classification accuracy. The\nstudy adopts a sophisticated experimental design that uses the IMDb dataset of\nsentimentally labelled movie reviews. Pre-processing includes tokenization and\nencoding of text data, making it suitable for NLP models. The dataset is used\non a BERT based model, measuring its performance using accuracy. The result\ncomes out to be 100 per cent accurate. Although the complete accuracy could\nappear impressive, it might be the result of overfitting or a lack of\ngeneralization. Further analysis is required to ensure the model's ability to\nhandle diverse and unseen data. The findings underscore the effectiveness of\ntransfer learning in NLP, showcasing its potential to excel in sentiment\nanalysis tasks. However, the research calls for a cautious interpretation of\nperfect accuracy and emphasizes the need for additional measures to validate\nthe model's generalization.\n","authors":["Aman Yadav","Abhishek Vichare"],"pdf_url":"https://arxiv.org/pdf/2311.16965v1.pdf","comment":"12 pages, 1 table, 4 figures"},{"id":"http://arxiv.org/abs/2311.16941v1","updated":"2023-11-28T16:46:14Z","published":"2023-11-28T16:46:14Z","title":"Debiasing Multimodal Models via Causal Information Minimization","summary":" Most existing debiasing methods for multimodal models, including causal\nintervention and inference methods, utilize approximate heuristics to represent\nthe biases, such as shallow features from early stages of training or unimodal\nfeatures for multimodal tasks like VQA, etc., which may not be accurate. In\nthis paper, we study bias arising from confounders in a causal graph for\nmultimodal data and examine a novel approach that leverages causally-motivated\ninformation minimization to learn the confounder representations. Robust\npredictive features contain diverse information that helps a model generalize\nto out-of-distribution data. Hence, minimizing the information content of\nfeatures obtained from a pretrained biased model helps learn the simplest\npredictive features that capture the underlying data distribution. We treat\nthese features as confounder representations and use them via methods motivated\nby causal theory to remove bias from models. 
We find that the learned\nconfounder representations indeed capture dataset biases, and the proposed\ndebiasing methods improve out-of-distribution (OOD) performance on multiple\nmultimodal datasets without sacrificing in-distribution performance.\nAdditionally, we introduce a novel metric to quantify the sufficiency of\nspurious features in models' predictions that further demonstrates the\neffectiveness of our proposed methods. Our code is available at:\nhttps://github.com/Vaidehi99/CausalInfoMin\n","authors":["Vaidehi Patil","Adyasha Maharana","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2311.16941v1.pdf","comment":"EMNLP 2023 Findings (16 pages)"},{"id":"http://arxiv.org/abs/2311.16922v1","updated":"2023-11-28T16:26:35Z","published":"2023-11-28T16:26:35Z","title":"Mitigating Object Hallucinations in Large Vision-Language Models through\n Visual Contrastive Decoding","summary":" Large Vision-Language Models (LVLMs) have advanced considerably, intertwining\nvisual recognition and language understanding to generate content that is not\nonly coherent but also contextually attuned. Despite their success, LVLMs still\nsuffer from the issue of object hallucinations, where models generate plausible\nyet incorrect outputs that include objects that do not exist in the images. To\nmitigate this issue, we introduce Visual Contrastive Decoding (VCD), a simple\nand training-free method that contrasts output distributions derived from\noriginal and distorted visual inputs. The proposed VCD effectively reduces the\nover-reliance on statistical bias and unimodal priors, two essential causes of\nobject hallucinations. This adjustment ensures the generated content is closely\ngrounded to visual inputs, resulting in contextually accurate outputs. Our\nexperiments show that VCD, without either additional training or the usage of\nexternal tools, significantly mitigates the object hallucination issue across\ndifferent LVLM families. Beyond mitigating object hallucinations, VCD also\nexcels in general LVLM benchmarks, highlighting its wide-ranging applicability.\n","authors":["Sicong Leng","Hang Zhang","Guanzheng Chen","Xin Li","Shijian Lu","Chunyan Miao","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2311.16922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08659v4","updated":"2023-11-28T16:06:59Z","published":"2023-10-12T18:34:08Z","title":"LoftQ: LoRA-Fine-Tuning-Aware Quantization for Large Language Models","summary":" Quantization is an indispensable technique for serving Large Language Models\n(LLMs) and has recently found its way into LoRA fine-tuning. In this work we\nfocus on the scenario where quantization and LoRA fine-tuning are applied\ntogether on a pre-trained model. In such cases it is common to observe a\nconsistent gap in the performance on downstream tasks between full fine-tuning\nand quantization plus LoRA fine-tuning approach. In response, we propose LoftQ\n(LoRA-Fine-Tuning-aware Quantization), a novel quantization framework that\nsimultaneously quantizes an LLM and finds a proper low-rank initialization for\nLoRA fine-tuning. Such an initialization alleviates the discrepancy between the\nquantized and full-precision model and significantly improves generalization in\ndownstream tasks. 
We evaluate our method on natural language understanding,\nquestion answering, summarization, and natural language generation tasks.\nExperiments show that our method is highly effective and outperforms existing\nquantization methods, especially in the challenging 2-bit and 2/4-bit mixed\nprecision regimes. The code is available on https://github.com/yxli2123/LoftQ.\n","authors":["Yixiao Li","Yifan Yu","Chen Liang","Pengcheng He","Nikos Karampatziakis","Weizhu Chen","Tuo Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.08659v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06627v3","updated":"2023-11-28T15:57:16Z","published":"2023-10-10T13:45:59Z","title":"What If the TV Was Off? Examining Counterfactual Reasoning Abilities of\n Multi-modal Language Models","summary":" Counterfactual reasoning, a fundamental aspect of human cognition, involves\ncontemplating alternatives to established facts or past events, significantly\nenhancing our abilities in planning and decision-making. In light of the\nadvancements in current multi-modal large language models, we explore their\neffectiveness in counterfactual reasoning. To facilitate this investigation, we\nintroduce a novel dataset, C-VQA, specifically designed to test the\ncounterfactual reasoning capabilities of modern multi-modal large language\nmodels. This dataset is constructed by infusing original questions with\ncounterfactual presuppositions, spanning various types such as numerical and\nboolean queries. It encompasses a mix of real and synthetic data, representing\na wide range of difficulty levels. Our thorough evaluations of contemporary\nvision-language models using this dataset have revealed substantial performance\ndrops, with some models showing up to a 40% decrease, highlighting a\nsignificant gap between current models and human-like vision reasoning\ncapabilities. We hope our dataset will serve as a vital benchmark for\nevaluating the counterfactual reasoning capabilities of models. Code and\ndataset are publicly available at https://bzhao.me/C-VQA/.\n","authors":["Letian Zhang","Xiaotong Zhai","Zhongkai Zhao","Yongshuo Zong","Xin Wen","Bingchen Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.06627v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16882v1","updated":"2023-11-28T15:31:11Z","published":"2023-11-28T15:31:11Z","title":"Optimisation-Based Multi-Modal Semantic Image Editing","summary":" Image editing affords increased control over the aesthetics and content of\ngenerated images. Pre-existing works focus predominantly on text-based\ninstructions to achieve desired image modifications, which limit edit precision\nand accuracy. In this work, we propose an inference-time editing optimisation,\ndesigned to extend beyond textual edits to accommodate multiple editing\ninstruction types (e.g. spatial layout-based; pose, scribbles, edge maps). We\npropose to disentangle the editing task into two competing subtasks: successful\nlocal image modifications and global content consistency preservation, where\nsubtasks are guided through two dedicated loss functions. By allowing to adjust\nthe influence of each loss function, we build a flexible editing solution that\ncan be adjusted to user preferences. 
We evaluate our method using text, pose\nand scribble edit conditions, and highlight our ability to achieve complex\nedits, through both qualitative and quantitative experiments.\n","authors":["Bowen Li","Yongxin Yang","Steven McDonagh","Shifeng Zhang","Petru-Daniel Tudosiu","Sarah Parisot"],"pdf_url":"https://arxiv.org/pdf/2311.16882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16867v1","updated":"2023-11-28T15:12:47Z","published":"2023-11-28T15:12:47Z","title":"The Falcon Series of Open Language Models","summary":" We introduce the Falcon series: 7B, 40B, and 180B parameters causal\ndecoder-only models trained on a diverse high-quality corpora predominantly\nassembled from web data. The largest model, Falcon-180B, has been trained on\nover 3.5 trillion tokens of text--the largest openly documented pretraining\nrun. Falcon-180B significantly outperforms models such as PaLM or Chinchilla,\nand improves upon concurrently developed models such as LLaMA 2 or\nInflection-1. It nears the performance of PaLM-2-Large at a reduced pretraining\nand inference cost, making it, to our knowledge, one of the three best language\nmodels in the world along with GPT-4 and PaLM-2-Large. We report detailed\nevaluations, as well as a deep dive into the methods and custom tooling\nemployed to pretrain Falcon. Notably, we report on our custom distributed\ntraining codebase, allowing us to efficiently pretrain these models on up to\n4,096 A100s on cloud AWS infrastructure with limited interconnect. We release a\n600B tokens extract of our web dataset, as well as the Falcon-7/40/180B models\nunder a permissive license to foster open-science and accelerate the\ndevelopment of an open ecosystem of large language models.\n","authors":["Ebtesam Almazrouei","Hamza Alobeidli","Abdulaziz Alshamsi","Alessandro Cappelli","Ruxandra Cojocaru","Daniel Hesslow","Julien Launay","Quentin Malartic","Daniele Mazzotta","Badreddine Noune","Baptiste Pannier","Guilherme Penedo"],"pdf_url":"https://arxiv.org/pdf/2311.16867v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16865v1","updated":"2023-11-28T15:12:11Z","published":"2023-11-28T15:12:11Z","title":"A Benchmark for Evaluating Machine Translation Metrics on Dialects\n Without Standard Orthography","summary":" For sensible progress in natural language processing, it is important that we\nare aware of the limitations of the evaluation metrics we use. In this work, we\nevaluate how robust metrics are to non-standardized dialects, i.e. spelling\ndifferences in language varieties that do not have a standard orthography. To\ninvestigate this, we collect a dataset of human translations and human\njudgments for automatic machine translations from English to two Swiss German\ndialects. We further create a challenge set for dialect variation and benchmark\nexisting metrics' performances. Our results show that existing metrics cannot\nreliably evaluate Swiss German text generation outputs, especially on segment\nlevel. We propose initial design adaptations that increase robustness in the\nface of non-standardized dialects, although there remains much room for further\nimprovement. 
The dataset, code, and models are available here:\nhttps://github.com/textshuttle/dialect_eval\n","authors":["Noëmi Aepli","Chantal Amrhein","Florian Schottmann","Rico Sennrich"],"pdf_url":"https://arxiv.org/pdf/2311.16865v1.pdf","comment":"WMT 2023 Research Paper"},{"id":"http://arxiv.org/abs/2311.16842v1","updated":"2023-11-28T14:55:52Z","published":"2023-11-28T14:55:52Z","title":"RELIC: Investigating Large Language Model Responses using\n Self-Consistency","summary":" Large Language Models (LLMs) are notorious for blending fact with fiction and\ngenerating non-factual content, known as hallucinations. To tackle this\nchallenge, we propose an interactive system that helps users obtain insights\ninto the reliability of the generated text. Our approach is based on the idea\nthat the self-consistency of multiple samples generated by the same LLM relates\nto its confidence in individual claims in the generated texts. Using this idea,\nwe design RELIC, an interactive system that enables users to investigate and\nverify semantic-level variations in multiple long-form responses. This allows\nusers to recognize potentially inaccurate information in the generated text and\nmake necessary corrections. From a user study with ten participants, we\ndemonstrate that our approach helps users better verify the reliability of the\ngenerated text. We further summarize the design implications and lessons\nlearned from this research for inspiring future studies on reliable human-LLM\ninteractions.\n","authors":["Furui Cheng","Vilém Zouhar","Simran Arora","Mrinmaya Sachan","Hendrik Strobelt","Mennatallah El-Assady"],"pdf_url":"https://arxiv.org/pdf/2311.16842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16840v1","updated":"2023-11-28T14:55:22Z","published":"2023-11-28T14:55:22Z","title":"The Claire French Dialogue Dataset","summary":" We present the Claire French Dialogue Dataset (CFDD), a resource created by\nmembers of LINAGORA Labs in the context of the OpenLLM France initiative. CFDD\nis a corpus containing roughly 160 million words from transcripts and stage\nplays in French that we have assembled and publicly released in an effort to\nfurther the development of multilingual, open source language models. This\npaper describes the 24 individual corpora of which CFDD is composed and\nprovides links and citations to their original sources. It also provides our\nproposed breakdown of the full CFDD dataset into eight categories of subcorpora\nand describes the process we followed to standardize the format of the final\ndataset. We conclude with a discussion of similar work and future directions.\n","authors":["Julie Hunter","Jérôme Louradour","Virgile Rennard","Ismaïl Harrando","Guokan Shang","Jean-Pierre Lorré"],"pdf_url":"https://arxiv.org/pdf/2311.16840v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16839v1","updated":"2023-11-28T14:54:37Z","published":"2023-11-28T14:54:37Z","title":"Beyond Hallucinations: Enhancing LVLMs through Hallucination-Aware\n Direct Preference Optimization","summary":" Multimodal large language models have made significant advancements in recent\nyears, yet they still suffer from a common issue known as the \"hallucination\nproblem\" where the models generate textual descriptions that contain inaccurate\nor non-existent content from the image. To address this issue, this paper\nintroduces a novel strategy: Hallucination-Aware Direct Preference Optimization\n(HA-DPO). 
Our approach treats the hallucination problem as a unique preference\nselection issue, where the model is trained to favor the non-hallucinating\nresponse when presented with two responses of the same image (one accurate and\none hallucinating). This paper also presents an efficient process for\nconstructing hallucination sample pairs to ensure high-quality,\nstyle-consistent pairs for stable HA-DPO training. We applied this strategy to\ntwo mainstream multimodal models, and the results showed a significant\nreduction in the hallucination problem and an enhancement in the models'\ngeneralization capabilities. With HA-DPO, the MiniGPT-4 model demonstrates\nsignificant advancements: POPE accuracy increases from 51.13% to 85.66% (34.5%\nabsolute improvement), and the MME score escalates from 968.58 to 1365.76 (41%\nrelative improvement). The code, models, and datasets will be made publicly\navailable.\n","authors":["Zhiyuan Zhao","Bin Wang","Linke Ouyang","Xiaoyi Dong","Jiaqi Wang","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2311.16839v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2311.16832v1","updated":"2023-11-28T14:49:23Z","published":"2023-11-28T14:49:23Z","title":"CharacterGLM: Customizing Chinese Conversational AI Characters with\n Large Language Models","summary":" In this paper, we present CharacterGLM, a series of models built upon\nChatGLM, with model sizes ranging from 6B to 66B parameters. Our CharacterGLM\nis designed for generating Character-based Dialogues (CharacterDial), which\naims to equip a conversational AI system with character customization for\nsatisfying people's inherent social desires and emotional needs. On top of\nCharacterGLM, we can customize various AI characters or social agents by\nconfiguring their attributes (identities, interests, viewpoints, experiences,\nachievements, social relationships, etc.) and behaviors (linguistic features,\nemotional expressions, interaction patterns, etc.). Our model outperforms most\nmainstream close-source large langauge models, including the GPT series,\nespecially in terms of consistency, human-likeness, and engagement according to\nmanual evaluations. We will release our 6B version of CharacterGLM and a subset\nof training data to facilitate further research development in the direction of\ncharacter-based dialogue generation.\n","authors":["Jinfeng Zhou","Zhuang Chen","Dazhen Wan","Bosi Wen","Yi Song","Jifan Yu","Yongkang Huang","Libiao Peng","Jiaming Yang","Xiyao Xiao","Sahand Sabour","Xiaohan Zhang","Wenjing Hou","Yijia Zhang","Yuxiao Dong","Jie Tang","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2311.16832v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2112.03002v2","updated":"2023-11-28T14:37:45Z","published":"2021-11-13T06:59:27Z","title":"GraphPrompt: Graph-Based Prompt Templates for Biomedical Synonym\n Prediction","summary":" In the expansion of biomedical dataset, the same category may be labeled with\ndifferent terms, thus being tedious and onerous to curate these terms.\nTherefore, automatically mapping synonymous terms onto the ontologies is\ndesirable, which we name as biomedical synonym prediction task. Unlike\nbiomedical concept normalization (BCN), no clues from context can be used to\nenhance synonym prediction, making it essential to extract graph features from\nontology. We introduce an expert-curated dataset OBO-syn encompassing 70\ndifferent types of concepts and 2 million curated concept-term pairs for\nevaluating synonym prediction methods. 
We find BCN methods perform weakly on\nthis task for not making full use of graph information. Therefore, we propose\nGraphPrompt, a prompt-based learning approach that creates prompt templates\naccording to the graphs. GraphPrompt obtained 37.2\\% and 28.5\\% improvement on\nzero-shot and few-shot settings respectively, indicating the effectiveness of\nthese graph-based prompt templates. We envision that our method GraphPrompt and\nOBO-syn dataset can be broadly applied to graph-based NLP tasks, and serve as\nthe basis for analyzing diverse and accumulating biomedical data. All the data\nand codes are avalible at: https://github.com/HanwenXuTHU/GraphPrompt\n","authors":["Hanwen Xu","Jiayou Zhang","Zhirui Wang","Shizhuo Zhang","Megh Manoj Bhalerao","Yucong Liu","Dawei Zhu","Sheng Wang"],"pdf_url":"https://arxiv.org/pdf/2112.03002v2.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2311.16822v1","updated":"2023-11-28T14:36:43Z","published":"2023-11-28T14:36:43Z","title":"Large Language Models Suffer From Their Own Output: An Analysis of the\n Self-Consuming Training Loop","summary":" Large language models (LLM) have become state of the art in many benchmarks\nand conversational LLM applications like ChatGPT are now widely used by the\npublic. Those LLMs can be used to generate large amounts of content which is\nposted on the internet to various platforms. As LLMs are trained on datasets\nusually collected from the internet, this LLM-generated content might be used\nto train the next generation of LLMs. Therefore, a self-consuming training loop\nemerges in which new LLM generations are trained on the output from the\nprevious generations. We empirically study this self-consuming training loop\nusing a novel dataset to analytically and accurately measure quality and\ndiversity of generated outputs. We find that this self-consuming training loop\ninitially improves both quality and diversity. However, after a few generations\nthe output inevitably degenerates in diversity. We find that the rate of\ndegeneration depends on the proportion of real and generated data.\n","authors":["Martin Briesch","Dominik Sobania","Franz Rothlauf"],"pdf_url":"https://arxiv.org/pdf/2311.16822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16789v1","updated":"2023-11-28T13:51:32Z","published":"2023-11-28T13:51:32Z","title":"A Survey of the Evolution of Language Model-Based Dialogue Systems","summary":" Dialogue systems, including task-oriented_dialogue_system (TOD) and\nopen-domain_dialogue_system (ODD), have undergone significant transformations,\nwith language_models (LM) playing a central role. This survey delves into the\nhistorical trajectory of dialogue systems, elucidating their intricate\nrelationship with advancements in language models by categorizing this\nevolution into four distinct stages, each marked by pivotal LM breakthroughs:\n1) Early_Stage: characterized by statistical LMs, resulting in rule-based or\nmachine-learning-driven dialogue_systems; 2) Independent development of TOD and\nODD based on neural_language_models (NLM; e.g., LSTM and GRU), since NLMs lack\nintrinsic knowledge in their parameters; 3) fusion between different types of\ndialogue systems with the advert of pre-trained_language_models (PLMs),\nstarting from the fusion between four_sub-tasks_within_TOD, and then\nTOD_with_ODD; and 4) current LLM-based_dialogue_system, wherein LLMs can be\nused to conduct TOD and ODD seamlessly. 
Thus, our survey provides a\nchronological perspective aligned with LM breakthroughs, offering a\ncomprehensive review of state-of-the-art research outcomes. What's more, we\nfocus on emerging topics and discuss open challenges, providing valuable\ninsights into future directions for LLM-based_dialogue_systems. Through this\nexploration, we pave the way for a deeper_comprehension of the evolution,\nguiding future developments in LM-based dialogue_systems.\n","authors":["Hongru Wang","Lingzhi Wang","Yiming Du","Liang Chen","Jingyan Zhou","Yufei Wang","Kam-Fai Wong"],"pdf_url":"https://arxiv.org/pdf/2311.16789v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16787v1","updated":"2023-11-28T13:50:50Z","published":"2023-11-28T13:50:50Z","title":"Evaluating Optimal Reference Translations","summary":" The overall translation quality reached by current machine translation (MT)\nsystems for high-resourced language pairs is remarkably good. Standard methods\nof evaluation are not suitable nor intended to uncover the many translation\nerrors and quality deficiencies that still persist. Furthermore, the quality of\nstandard reference translations is commonly questioned and comparable quality\nlevels have been reached by MT alone in several language pairs. Navigating\nfurther research in these high-resource settings is thus difficult. In this\narticle, we propose a methodology for creating more reliable document-level\nhuman reference translations, called \"optimal reference translations,\" with the\nsimple aim to raise the bar of what should be deemed \"human translation\nquality.\" We evaluate the obtained document-level optimal reference\ntranslations in comparison with \"standard\" ones, confirming a significant\nquality increase and also documenting the relationship between evaluation and\ntranslation editing.\n","authors":["Vilém Zouhar","Věra Kloudová","Martin Popel","Ondřej Bojar"],"pdf_url":"https://arxiv.org/pdf/2311.16787v1.pdf","comment":"To appear in Natural Language Engineering 2024"},{"id":"http://arxiv.org/abs/2310.01837v2","updated":"2023-11-28T13:36:58Z","published":"2023-10-03T07:01:23Z","title":"Extending CAM-based XAI methods for Remote Sensing Imagery Segmentation","summary":" Current AI-based methods do not provide comprehensible physical\ninterpretations of the utilized data, extracted features, and\npredictions/inference operations. As a result, deep learning models trained\nusing high-resolution satellite imagery lack transparency and explainability\nand can be merely seen as a black box, which limits their wide-level adoption.\nExperts need help understanding the complex behavior of AI models and the\nunderlying decision-making process. The explainable artificial intelligence\n(XAI) field is an emerging field providing means for robust, practical, and\ntrustworthy deployment of AI models. Several XAI techniques have been proposed\nfor image classification tasks, whereas the interpretation of image\nsegmentation remains largely unexplored. This paper offers to bridge this gap\nby adapting the recent XAI classification algorithms and making them usable for\nmuti-class image segmentation, where we mainly focus on buildings' segmentation\nfrom high-resolution satellite images. To benchmark and compare the performance\nof the proposed approaches, we introduce a new XAI evaluation methodology and\nmetric based on \"Entropy\" to measure the model uncertainty. 
Conventional XAI\nevaluation methods rely mainly on feeding area-of-interest regions from the\nimage back to the pre-trained (utility) model and then calculating the average\nchange in the probability of the target class. Those evaluation metrics lack\nthe needed robustness, and we show that using Entropy to monitor the model\nuncertainty in segmenting the pixels within the target class is more suitable.\nWe hope this work will pave the way for additional XAI research for image\nsegmentation and applications in the remote sensing discipline.\n","authors":["Abdul Karim Gizzini","Mustafa Shukor","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01837v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16764v1","updated":"2023-11-28T13:08:26Z","published":"2023-11-28T13:08:26Z","title":"Radiology-Aware Model-Based Evaluation Metric for Report Generation","summary":" We propose a new automated evaluation metric for machine-generated radiology\nreports using the successful COMET architecture adapted for the radiology\ndomain. We train and publish four medically-oriented model checkpoints,\nincluding one trained on RadGraph, a radiology knowledge graph. Our results\nshow that our metric correlates moderately to high with established metrics\nsuch as BERTscore, BLEU, and CheXbert scores. Furthermore, we demonstrate that\none of our checkpoints exhibits a high correlation with human judgment, as\nassessed using the publicly available annotations of six board-certified\nradiologists, using a set of 200 reports. We also performed our own analysis\ngathering annotations with two radiologists on a collection of 100 reports. The\nresults indicate the potential effectiveness of our method as a\nradiology-specific evaluation metric. The code, data, and model checkpoints to\nreproduce our findings will be publicly available.\n","authors":["Amos Calamida","Farhad Nooralahzadeh","Morteza Rohanian","Koji Fujimoto","Mizuho Nishio","Michael Krauthammer"],"pdf_url":"https://arxiv.org/pdf/2311.16764v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2307.06985v4","updated":"2023-11-28T12:59:20Z","published":"2023-07-13T17:25:28Z","title":"Patent Documents to Engineering Design Knowledge Graphs","summary":" Aimed at supporting knowledge-intensive tasks in the design process,\npopulating design knowledge from text documents involves the extraction of\ntriples - head entity :: relationship :: tail entity or h :: r :: t that could\nbe combined into a knowledge graph representation. As relationships are largely\nchosen from ontological or common-sense alternatives, knowledge graphs built\nusing these depict an approximation or restricted view of design knowledge,\nrather than what is explicated in text document. In this article, we present a\ndata-driven approach to identify and explicate facts (h :: r :: t) from\nsentences in patent documents. We create a dataset of 44,227 sentences and\nfacts, encompassing all patent classifications while also capturing the\nvariations among patent document sections. Using this dataset, we train taggers\nthat classify tokens to: 1) identify all entities (h) and relationships (r) and\n2) specific relationships (r) for a pair of entities (h :: ___ :: t). While\nthese taggers are built upon transformer-based sequence classification models,\nwe evaluate our proposed method against edge classification approaches that use\nlinear classifiers and graph neural networks, incorporating transformer-based\ntoken embeddings and linguistic features. 
The simplicity and coverage of the\nproposed method enable its application to patent documents at any scale and\nvariety. Upon deploying an open-source python package, we apply our method to\npatent documents related to fan systems. From the knowledge graphs thus\nextracted, we explain how facts could be generalised to domain ontologies as\nwell as be specified to subsystem levels. We also highlight the importance of\nknowledge graph representations by retrieving and explicating the knowledge of\nkey issues in fan systems, while holding a comparative discussion against\nopinions from ChatGPT.\n","authors":["L Siddharth","Jianxi Luo"],"pdf_url":"https://arxiv.org/pdf/2307.06985v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12399v2","updated":"2023-11-28T12:32:05Z","published":"2023-11-21T07:22:48Z","title":"A Survey of Graph Meets Large Language Model: Progress and Future\n Directions","summary":" Graph plays a significant role in representing and analyzing complex\nrelationships in real-world applications such as citation networks, social\nnetworks, and biological data. Recently, Large Language Models (LLMs), which\nhave achieved tremendous success in various domains, have also been leveraged\nin graph-related tasks to surpass traditional Graph Neural Networks (GNNs)\nbased methods and yield state-of-the-art performance. In this survey, we first\npresent a comprehensive review and analysis of existing methods that integrate\nLLMs with graphs. First of all, we propose a new taxonomy, which organizes\nexisting methods into three categories based on the role (i.e., enhancer,\npredictor, and alignment component) played by LLMs in graph-related tasks. Then\nwe systematically survey the representative methods along the three categories\nof the taxonomy. Finally, we discuss the remaining limitations of existing\nstudies and highlight promising avenues for future research. The relevant\npapers are summarized and will be consistently updated at:\nhttps://github.com/yhLeeee/Awesome-LLMs-in-Graph-tasks.\n","authors":["Yuhan Li","Zhixun Li","Peisong Wang","Jia Li","Xiangguo Sun","Hong Cheng","Jeffrey Xu Yu"],"pdf_url":"https://arxiv.org/pdf/2311.12399v2.pdf","comment":"Work in progress; 13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.16733v1","updated":"2023-11-28T12:29:33Z","published":"2023-11-28T12:29:33Z","title":"LLMs for Science: Usage for Code Generation and Data Analysis","summary":" Large language models (LLMs) have been touted to enable increased\nproductivity in many areas of today's work life. Scientific research as an area\nof work is no exception: the potential of LLM-based tools to assist in the\ndaily work of scientists has become a highly discussed topic across\ndisciplines. However, we are only at the very onset of this subject of study.\nIt is still unclear how the potential of LLMs will materialise in research\npractice. With this study, we give first empirical evidence on the use of LLMs\nin the research process. We have investigated a set of use cases for LLM-based\ntools in scientific research, and conducted a first study to assess to which\ndegree current tools are helpful. In this paper we report specifically on use\ncases related to software engineering, such as generating application code and\ndeveloping scripts for data analytics. While we studied seemingly simple use\ncases, results across tools differ significantly. 
Our results highlight the\npromise of LLM-based tools in general, yet we also observe various issues,\nparticularly regarding the integrity of the output these tools provide.\n","authors":["Mohamed Nejjar","Luca Zacharias","Fabian Stiehle","Ingo Weber"],"pdf_url":"https://arxiv.org/pdf/2311.16733v1.pdf","comment":"Preprint; In Submission"},{"id":"http://arxiv.org/abs/2310.02071v4","updated":"2023-11-28T11:23:14Z","published":"2023-10-03T14:13:36Z","title":"Towards End-to-End Embodied Decision Making via Multi-modal Large\n Language Model: Explorations with GPT4-Vision and Beyond","summary":" In this study, we explore the potential of Multimodal Large Language Models\n(MLLMs) in improving embodied decision-making processes for agents. While Large\nLanguage Models (LLMs) have been widely used due to their advanced reasoning\nskills and vast world knowledge, MLLMs like GPT4-Vision offer enhanced visual\nunderstanding and reasoning capabilities. We investigate whether\nstate-of-the-art MLLMs can handle embodied decision-making in an end-to-end\nmanner and whether collaborations between LLMs and MLLMs can enhance\ndecision-making. To address these questions, we introduce a new benchmark\ncalled PCA-EVAL, which evaluates embodied decision-making from the perspectives\nof Perception, Cognition, and Action. Additionally, we propose HOLMES, a\nmulti-agent cooperation framework that allows LLMs to leverage MLLMs and APIs\nto gather multimodal information for informed decision-making. We compare\nend-to-end embodied decision-making and HOLMES on our benchmark and find that\nthe GPT4-Vision model demonstrates strong end-to-end embodied decision-making\nabilities, outperforming GPT4-HOLMES in terms of average decision accuracy\n(+3%). However, this performance is exclusive to the latest GPT4-Vision model,\nsurpassing the open-source state-of-the-art MLLM by 26%. Our results indicate\nthat powerful MLLMs like GPT4-Vision hold promise for decision-making in\nembodied agents, offering new avenues for MLLM research. Code and data are open\nat https://github.com/pkunlp-icler/PCA-EVAL/.\n","authors":["Liang Chen","Yichi Zhang","Shuhuai Ren","Haozhe Zhao","Zefan Cai","Yuchi Wang","Peiyi Wang","Tianyu Liu","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2310.02071v4.pdf","comment":"FMDM@NeurIPS2023, Code and data:\n https://github.com/pkunlp-icler/PCA-EVAL/"},{"id":"http://arxiv.org/abs/2311.16678v1","updated":"2023-11-28T10:50:00Z","published":"2023-11-28T10:50:00Z","title":"Entity-Aspect-Opinion-Sentiment Quadruple Extraction for Fine-grained\n Sentiment Analysis","summary":" Product reviews often contain a large number of implicit aspects and\nobject-attribute co-existence cases. Unfortunately, many existing studies in\nAspect-Based Sentiment Analysis (ABSA) have overlooked this issue, which can\nmake it difficult to extract opinions comprehensively and fairly. In this\npaper, we propose a new task called Entity-Aspect-Opinion-Sentiment Quadruple\nExtraction (EASQE), which aims to hierarchically decompose aspect terms into\nentities and aspects to avoid information loss, non-exclusive annotations, and\nopinion misunderstandings in ABSA tasks. To facilitate research in this new\ntask, we have constructed four datasets (Res14-EASQE, Res15-EASQE, Res16-EASQE,\nand Lap14-EASQE) based on the SemEval Restaurant and Laptop datasets. We have\nalso proposed a novel two-stage sequence-tagging based Trigger-Opinion\nframework as the baseline for the EASQE task. 
Empirical evaluations show that\nour Trigger-Opinion framework can generate satisfactory EASQE results and can\nalso be applied to other ABSA tasks, significantly outperforming\nstate-of-the-art methods. We have made the four datasets and source code of\nTrigger-Opinion publicly available to facilitate further research in this area.\n","authors":["Dan Ma","Jun Xu","Zongyu Wang","Xuezhi Cao","Yunsen Xian"],"pdf_url":"https://arxiv.org/pdf/2311.16678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16675v1","updated":"2023-11-28T10:42:35Z","published":"2023-11-28T10:42:35Z","title":"A Distribution-Based Threshold for Determining Sentence Similarity","summary":" We hereby present a solution to a semantic textual similarity (STS) problem\nin which it is necessary to match two sentences containing, as the only\ndistinguishing factor, highly specific information (such as names, addresses,\nidentification codes), and from which we need to derive a definition for when\nthey are similar and when they are not. The solution revolves around the use of\na neural network, based on the siamese architecture, to create the\ndistributions of the distances between similar and dissimilar pairs of\nsentences. The goal of these distributions is to find a discriminating factor,\nthat we call \"threshold\", which represents a well-defined quantity that can be\nused to distinguish vector distances of similar pairs from vector distances of\ndissimilar pairs in new predictions and later analyses. In addition, we\ndeveloped a way to score the predictions by combining attributes from both the\ndistributions' features and the way the distance function works. Finally, we\ngeneralize the results showing that they can be transferred to a wider range of\ndomains by applying the system discussed to a well-known and widely used\nbenchmark dataset for STS problems.\n","authors":["Gioele Cadamuro","Marco Gruppo"],"pdf_url":"https://arxiv.org/pdf/2311.16675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.02804v2","updated":"2023-11-28T10:38:19Z","published":"2022-10-06T10:30:53Z","title":"Just ClozE! A Novel Framework for Evaluating the Factual Consistency\n Faster in Abstractive Summarization","summary":" The issue of factual consistency in abstractive summarization has received\nextensive attention in recent years, and the evaluation of factual consistency\nbetween summary and document has become an important and urgent task. Most of\nthe current evaluation metrics are adopted from the question answering (QA) or\nnatural language inference (NLI) task. However, the application of QA-based\nmetrics is extremely time-consuming in practice while NLI-based metrics are\nlack of interpretability. In this paper, we propose a cloze-based evaluation\nframework called ClozE and show the great potential of the cloze-based metric.\nIt inherits strong interpretability from QA, while maintaining the speed of\nNLI- level reasoning. 
We demonstrate that ClozE can reduce the evaluation time\nby nearly 96% relative to QA-based metrics while retaining their\ninterpretability and performance through experiments on six human-annotated\ndatasets and a meta-evaluation benchmark GO FIGURE (Gabriel et al., 2021).\nFinally, we discuss three important facets of ClozE in practice, which further\nshows better overall performance of ClozE compared to other metrics.\n","authors":["Yiyang Li","Lei Li","Marina Litvak","Natalia Vanetik","Dingxin Hu","Yuze Li","Yanquan Zhou"],"pdf_url":"https://arxiv.org/pdf/2210.02804v2.pdf","comment":"The manuscript for JAIR"},{"id":"http://arxiv.org/abs/2310.08992v2","updated":"2023-11-28T10:32:19Z","published":"2023-10-13T10:17:48Z","title":"CodeChain: Towards Modular Code Generation Through Chain of\n Self-revisions with Representative Sub-modules","summary":" Large Language Models (LLMs) have already become quite proficient at solving\nsimpler programming tasks like those in HumanEval or MBPP benchmarks. However,\nsolving more complex and competitive programming tasks is still quite\nchallenging for these models - possibly due to their tendency to generate\nsolutions as monolithic code blocks instead of decomposing them into logical\nsub-tasks and sub-modules. On the other hand, experienced programmers\ninstinctively write modularized code with abstraction for solving complex\ntasks, often reusing previously developed modules. To address this gap, we\npropose CodeChain, a novel framework for inference that elicits modularized\ncode generation through a chain of self-revisions, each being guided by some\nrepresentative sub-modules generated in previous iterations. Concretely,\nCodeChain first instructs the LLM to generate modularized codes through\nchain-of-thought prompting. Then it applies a chain of self-revisions by\niterating the two steps: 1) extracting and clustering the generated sub-modules\nand selecting the cluster representatives as the more generic and re-usable\nimplementations, and 2) augmenting the original chain-of-thought prompt with\nthese selected module-implementations and instructing the LLM to re-generate\nnew modularized solutions. We find that by naturally encouraging the LLM to\nreuse the previously developed and verified sub-modules, CodeChain can\nsignificantly boost both modularity as well as correctness of the generated\nsolutions, achieving relative pass@1 improvements of 35% on APPS and 76% on\nCodeContests. It is shown to be effective on both OpenAI LLMs as well as\nopen-sourced LLMs like WizardCoder. We also conduct comprehensive ablation\nstudies with different methods of prompting, number of clusters, model sizes,\nprogram qualities, etc., to provide useful insights that underpin CodeChain's\nsuccess.\n","authors":["Hung Le","Hailin Chen","Amrita Saha","Akash Gokul","Doyen Sahoo","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2310.08992v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16650v1","updated":"2023-11-28T10:02:08Z","published":"2023-11-28T10:02:08Z","title":"Text2Tree: Aligning Text Representation to the Label Tree Hierarchy for\n Imbalanced Medical Classification","summary":" Deep learning approaches exhibit promising performances on various text\ntasks. However, they are still struggling on medical text classification since\nsamples are often extremely imbalanced and scarce. 
Different from existing\nmainstream approaches that focus on supplementary semantics with external\nmedical information, this paper aims to rethink the data challenges in medical\ntexts and present a novel framework-agnostic algorithm called Text2Tree that\nonly utilizes internal label hierarchy in training deep learning models. We\nembed the ICD code tree structure of labels into cascade attention modules for\nlearning hierarchy-aware label representations. Two new learning schemes,\nSimilarity Surrogate Learning (SSL) and Dissimilarity Mixup Learning (DML), are\ndevised to boost text classification by reusing and distinguishing samples of\nother labels following the label representation hierarchy, respectively.\nExperiments on authoritative public datasets and real-world medical records\nshow that our approach stably achieves superior performances over classical and\nadvanced imbalanced classification methods.\n","authors":["Jiahuan Yan","Haojun Gao","Zhang Kai","Weize Liu","Danny Chen","Jian Wu","Jintai Chen"],"pdf_url":"https://arxiv.org/pdf/2311.16650v1.pdf","comment":"EMNLP 2023 Findings. Code: https://github.com/jyansir/Text2Tree"},{"id":"http://arxiv.org/abs/2311.05332v2","updated":"2023-11-28T09:47:57Z","published":"2023-11-09T12:58:37Z","title":"On the Road with GPT-4V(ision): Early Explorations of Visual-Language\n Model on Autonomous Driving","summary":" The pursuit of autonomous driving technology hinges on the sophisticated\nintegration of perception, decision-making, and control systems. Traditional\napproaches, both data-driven and rule-based, have been hindered by their\ninability to grasp the nuance of complex driving environments and the\nintentions of other road users. This has been a significant bottleneck,\nparticularly in the development of common sense reasoning and nuanced scene\nunderstanding necessary for safe and reliable autonomous driving. The advent of\nVisual Language Models (VLM) represents a novel frontier in realizing fully\nautonomous vehicle driving. This report provides an exhaustive evaluation of\nthe latest state-of-the-art VLM, GPT-4V(ision), and its application in\nautonomous driving scenarios. We explore the model's abilities to understand\nand reason about driving scenes, make decisions, and ultimately act in the\ncapacity of a driver. Our comprehensive tests span from basic scene recognition\nto complex causal reasoning and real-time decision-making under varying\nconditions. Our findings reveal that GPT-4V demonstrates superior performance\nin scene understanding and causal reasoning compared to existing autonomous\nsystems. It showcases the potential to handle out-of-distribution scenarios,\nrecognize intentions, and make informed decisions in real driving contexts.\nHowever, challenges remain, particularly in direction discernment, traffic\nlight recognition, vision grounding, and spatial reasoning tasks. These\nlimitations underscore the need for further research and development. 
Project\nis now available on GitHub for interested parties to access and utilize:\n\\url{https://github.com/PJLab-ADG/GPT4V-AD-Exploration}\n","authors":["Licheng Wen","Xuemeng Yang","Daocheng Fu","Xiaofeng Wang","Pinlong Cai","Xin Li","Tao Ma","Yingxuan Li","Linran Xu","Dengke Shang","Zheng Zhu","Shaoyan Sun","Yeqi Bai","Xinyu Cai","Min Dou","Shuanglu Hu","Botian Shi","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2311.05332v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16639v1","updated":"2023-11-28T09:45:02Z","published":"2023-11-28T09:45:02Z","title":"Scaling Political Texts with ChatGPT","summary":" We use GPT-4 to obtain position estimates of political texts in continuous\nspaces. We develop and validate a new approach by positioning British party\nmanifestos on the economic, social, and immigration policy dimensions and\ntweets by members of the US Congress on the left-right ideological spectrum.\nFor the party manifestos, the correlation between the positions produced by\nGPT-4 and experts is 93% or higher, a performance similar to or better than\nthat obtained with crowdsourced position estimates. For individual tweets, the\npositions obtained with GPT-4 achieve a correlation of 91% with crowdsourced\nposition estimates. For senators of the 117th US Congress, the positions\nobtained with GPT-4 achieve a correlation of 97% with estimates based on roll\ncall votes and of 96% with those based on campaign funding. Correlations are\nalso substantial within party, indicating that position estimates produced with\nGPT-4 capture within-party differences between senators. Overall, using GPT-4\nfor ideological scaling is fast, cost-efficient, and reliable. This approach\nprovides a viable alternative to scaling by both expert raters and\ncrowdsourcing.\n","authors":["Gaël Le Mens","Aina Gallego"],"pdf_url":"https://arxiv.org/pdf/2311.16639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16620v1","updated":"2023-11-28T09:21:48Z","published":"2023-11-28T09:21:48Z","title":"On the Long Range Abilities of Transformers","summary":" Despite their dominance in modern DL and, especially, NLP domains,\ntransformer architectures exhibit sub-optimal performance on long-range tasks\ncompared to recent layers that are specifically designed for this purpose. In\nthis work, drawing inspiration from key attributes of long-range layers, such\nas state-space layers, linear RNN layers, and global convolution layers, we\ndemonstrate that minimal modifications to the transformer architecture can\nsignificantly enhance performance on the Long Range Arena (LRA) benchmark, thus\nnarrowing the gap with these specialized layers. We identify that two key\nprinciples for long-range tasks are (i) incorporating an inductive bias towards\nsmoothness, and (ii) locality. As we show, integrating these ideas into the\nattention mechanism improves results with a negligible amount of additional\ncomputation and without any additional trainable parameters. 
Our theory and\nexperiments also shed light on the reasons for the inferior performance of\ntransformers on long-range tasks and identify critical properties that are\nessential for successfully capturing long-range dependencies.\n","authors":["Itamar Zimerman","Lior Wolf"],"pdf_url":"https://arxiv.org/pdf/2311.16620v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2302.04023v4","updated":"2023-11-28T09:01:12Z","published":"2023-02-08T12:35:34Z","title":"A Multitask, Multilingual, Multimodal Evaluation of ChatGPT on\n Reasoning, Hallucination, and Interactivity","summary":" This paper proposes a framework for quantitatively evaluating interactive\nLLMs such as ChatGPT using publicly available data sets. We carry out an\nextensive technical evaluation of ChatGPT using 23 data sets covering 8\ndifferent common NLP application tasks. We evaluate the multitask, multilingual\nand multi-modal aspects of ChatGPT based on these data sets and a newly\ndesigned multimodal dataset. We find that ChatGPT outperforms LLMs with\nzero-shot learning on most tasks and even outperforms fine-tuned models on some\ntasks. We find that it is better at understanding non-Latin script languages\nthan generating them. It is able to generate multimodal content from textual\nprompts, via an intermediate code generation step. Moreover, we find that\nChatGPT is 63.41% accurate on average in 10 different reasoning categories\nunder logical reasoning, non-textual reasoning, and commonsense reasoning,\nhence making it an unreliable reasoner. It is, for example, better at deductive\nthan inductive reasoning. ChatGPT suffers from hallucination problems like\nother LLMs and it generates more extrinsic hallucinations from its parametric\nmemory as it does not have access to an external knowledge base. Finally, the\ninteractive feature of ChatGPT enables human collaboration with the underlying\nLLM to improve its performance, i.e, 8% ROUGE-1 on summarization and 2% ChrF++\non machine translation, in a multi-turn \"prompt engineering\" fashion. We also\nrelease codebase for evaluation set extraction.\n","authors":["Yejin Bang","Samuel Cahyawijaya","Nayeon Lee","Wenliang Dai","Dan Su","Bryan Wilie","Holy Lovenia","Ziwei Ji","Tiezheng Yu","Willy Chung","Quyet V. Do","Yan Xu","Pascale Fung"],"pdf_url":"https://arxiv.org/pdf/2302.04023v4.pdf","comment":"45 pages, AACL 2023"},{"id":"http://arxiv.org/abs/2311.16588v1","updated":"2023-11-28T08:13:29Z","published":"2023-11-28T08:13:29Z","title":"MedGen: A Python Natural Language Processing Toolkit for Medical Text\n Processing","summary":" This study introduces MedGen, a comprehensive natural language processing\n(NLP) toolkit designed for medical text processing. MedGen is tailored for\nbiomedical researchers and healthcare professionals with an easy-to-use,\nall-in-one solution that requires minimal programming expertise. It includes\n(1) Generative Functions: For the first time, MedGen includes four advanced\ngenerative functions: question answering, text summarization, text\nsimplification, and machine translation; (2) Basic NLP Functions: MedGen\nintegrates 12 essential NLP functions such as word tokenization and sentence\nsegmentation; and (3) Query and Search Capabilities: MedGen provides\nuser-friendly query and search functions on text corpora. We fine-tuned 32\ndomain-specific language models, evaluated them thoroughly on 24 established\nbenchmarks and conducted manual reviews with clinicians. 
Additionally, we\nexpanded our toolkit by introducing query and search functions, while also\nstandardizing and integrating functions from third-party libraries. The\ntoolkit, its models, and associated data are publicly available via\nhttps://github.com/Yale-LILY/MedGen.\n","authors":["Rui Yang","Qingcheng Zeng","Keen You","Yujie Qiao","Lucas Huang","Chia-Chun Hsieh","Benjamin Rosand","Jeremy Goldwasser","Amisha D Dave","Tiarnan D. L. Keenan","Emily Y Chew","Dragomir Radev","Zhiyong Lu","Hua Xu","Qingyu Chen","Irene Li"],"pdf_url":"https://arxiv.org/pdf/2311.16588v1.pdf","comment":"5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2310.00741v2","updated":"2023-11-28T08:06:53Z","published":"2023-10-01T17:37:31Z","title":"FELM: Benchmarking Factuality Evaluation of Large Language Models","summary":" Assessing factuality of text generated by large language models (LLMs) is an\nemerging yet crucial research area, aimed at alerting users to potential errors\nand guiding the development of more reliable LLMs. Nonetheless, the evaluators\nassessing factuality necessitate suitable evaluation themselves to gauge\nprogress and foster advancements. This direction remains under-explored,\nresulting in substantial impediments to the progress of factuality evaluators.\nTo mitigate this issue, we introduce a benchmark for Factuality Evaluation of\nlarge Language Models, referred to as felm. In this benchmark, we collect\nresponses generated from LLMs and annotate factuality labels in a fine-grained\nmanner. Contrary to previous studies that primarily concentrate on the\nfactuality of world knowledge (e.g.~information from Wikipedia), felm focuses\non factuality across diverse domains, spanning from world knowledge to math and\nreasoning. Our annotation is based on text segments, which can help pinpoint\nspecific factual errors. The factuality annotations are further supplemented by\npredefined error types and reference links that either support or contradict\nthe statement. In our experiments, we investigate the performance of several\nLLM-based factuality evaluators on felm, including both vanilla LLMs and those\naugmented with retrieval mechanisms and chain-of-thought processes. Our\nfindings reveal that while retrieval aids factuality evaluation, current LLMs\nare far from satisfactory to faithfully detect factual errors.\n","authors":["Shiqi Chen","Yiran Zhao","Jinghan Zhang","I-Chun Chern","Siyang Gao","Pengfei Liu","Junxian He"],"pdf_url":"https://arxiv.org/pdf/2310.00741v2.pdf","comment":"Accepted by NeurIPS 2023 Track on Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2311.16579v1","updated":"2023-11-28T07:47:25Z","published":"2023-11-28T07:47:25Z","title":"Recognizing Conditional Causal Relationships about Emotions and Their\n Corresponding Conditions","summary":" The study of causal relationships between emotions and causes in texts has\nrecently received much attention. Most works focus on extracting causally\nrelated clauses from documents. However, none of these works has considered\nthat the causal relationships among the extracted emotion and cause clauses can\nonly be valid under some specific context clauses. To highlight the context in\nsuch special causal relationships, we propose a new task to determine whether\nor not an input pair of emotion and cause has a valid causal relationship under\ndifferent contexts and extract the specific context clauses that participate in\nthe causal relationship. 
Since the task is new for which no existing dataset is\navailable, we conduct manual annotation on a benchmark dataset to obtain the\nlabels for our tasks and the annotations of each context clause's type that can\nalso be used in some other applications. We adopt negative sampling to\nconstruct the final dataset to balance the number of documents with and without\ncausal relationships. Based on the constructed dataset, we propose an\nend-to-end multi-task framework, where we design two novel and general modules\nto handle the two goals of our task. Specifically, we propose a context masking\nmodule to extract the context clauses participating in the causal\nrelationships. We propose a prediction aggregation module to fine-tune the\nprediction results according to whether the input emotion and causes depend on\nspecific context clauses. Results of extensive comparative experiments and\nablation studies demonstrate the effectiveness and generality of our proposed\nframework.\n","authors":["Xinhong Chen","Zongxi Li","Yaowei Wang","Haoran Xie","Jianping Wang","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2311.16579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.04840v5","updated":"2023-11-28T06:39:41Z","published":"2021-08-10T18:00:14Z","title":"Post-hoc Interpretability for Neural NLP: A Survey","summary":" Neural networks for NLP are becoming increasingly complex and widespread, and\nthere is a growing concern if these models are responsible to use. Explaining\nmodels helps to address the safety and ethical concerns and is essential for\naccountability. Interpretability serves to provide these explanations in terms\nthat are understandable to humans. Additionally, post-hoc methods provide\nexplanations after a model is learned and are generally model-agnostic. This\nsurvey provides a categorization of how recent post-hoc interpretability\nmethods communicate explanations to humans, it discusses each method in-depth,\nand how they are validated, as the latter is often a common concern.\n","authors":["Andreas Madsen","Siva Reddy","Sarath Chandar"],"pdf_url":"https://arxiv.org/pdf/2108.04840v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14457v4","updated":"2023-11-28T05:54:11Z","published":"2023-05-23T18:28:42Z","title":"Pre-training Language Models for Comparative Reasoning","summary":" Comparative reasoning is a process of comparing objects, concepts, or\nentities to draw conclusions, which constitutes a fundamental cognitive\nability. In this paper, we propose a novel framework to pre-train language\nmodels for enhancing their abilities of comparative reasoning over texts. While\nthere have been approaches for NLP tasks that require comparative reasoning,\nthey suffer from costly manual data labeling and limited generalizability to\ndifferent tasks. Our approach introduces a novel method of collecting scalable\ndata for text-based entity comparison, which leverages both structured and\nunstructured data. Moreover, we present a framework of pre-training language\nmodels via three novel objectives on comparative reasoning. Evaluation on\ndownstream tasks including comparative question answering, question generation,\nand summarization shows that our pre-training framework significantly improves\nthe comparative reasoning abilities of language models, especially under\nlow-resource conditions. 
This work also releases the first integrated benchmark\nfor comparative reasoning.\n","authors":["Mengxia Yu","Zhihan Zhang","Wenhao Yu","Meng Jiang"],"pdf_url":"https://arxiv.org/pdf/2305.14457v4.pdf","comment":"EMNLP 2023 - Camera Ready. Typos fixed"},{"id":"http://arxiv.org/abs/2311.04742v2","updated":"2023-11-28T05:25:45Z","published":"2023-11-08T15:11:57Z","title":"Using large language models to study human memory for meaningful\n narratives","summary":" One of the most impressive achievements of the AI revolution is the\ndevelopment of large language models that can generate meaningful text and\nrespond to instructions in plain English with no additional training necessary.\nHere we show that language models can be used as a scientific instrument for\nstudying human memory for meaningful material. We developed a pipeline for\ndesigning large scale memory experiments and analyzing the obtained results. We\nperformed online memory experiments with a large number of participants and\ncollected recognition and recall data for narratives of different lengths. We\nfound that both recall and recognition performance scale linearly with\nnarrative length. Furthermore, in order to investigate the role of narrative\ncomprehension in memory, we repeated these experiments using scrambled versions\nof the presented stories. We found that even though recall performance declined\nsignificantly, recognition remained largely unaffected. Interestingly, recalls\nin this condition seem to follow the original narrative order rather than the\nscrambled presentation, pointing to a contextual reconstruction of the story in\nmemory.\n","authors":["Antonios Georgiou","Tankut Can","Mikhail Katkov","Misha Tsodyks"],"pdf_url":"https://arxiv.org/pdf/2311.04742v2.pdf","comment":"v2: 43 pages, with added discussion and a new appendix C"},{"id":"http://arxiv.org/abs/2310.20246v4","updated":"2023-11-28T05:25:14Z","published":"2023-10-31T08:09:20Z","title":"Breaking Language Barriers in Multilingual Mathematical Reasoning:\n Insights and Observations","summary":" Existing research predominantly focuses on developing powerful language\nlearning models (LLMs) for mathematical reasoning within monolingual languages,\nwith few explorations in preserving efficacy in a multilingual context. To\nbridge this gap, this paper pioneers exploring and training powerful\nMultilingual Math Reasoning (xMR) LLMs. Firstly, by utilizing translation, we\nconstruct the first multilingual math reasoning instruction dataset,\nMGSM8KInstruct, encompassing ten distinct languages, thus addressing the issue\nof training data scarcity in xMR tasks. Based on the collected dataset, we\npropose different training strategies to build powerful xMR LLMs, named\nMathOctopus, notably outperform conventional open-source LLMs and exhibit\nsuperiority over ChatGPT in few-shot scenarios. Notably, MathOctopus-13B\nreaches 47.6% accuracy which exceeds ChatGPT 46.3% on MGSM testset. Beyond\nremarkable results, we unearth several pivotal observations and insights from\nextensive experiments: (1) When extending the rejection sampling strategy to\nthe multilingual context, it proves effective for model performances, albeit\nlimited. (2) Employing parallel corpora for math Supervised Fine-Tuning (SFT)\nacross multiple languages not only significantly enhances model performance\nmultilingually but also elevates their monolingual performance. 
This indicates\nthat crafting multilingual corpora can be regarded as a vital strategy for\nenhancing model performance in a specific language, especially in mathematical\nreasoning tasks. For instance, MathOctopus-7B improves its counterparts that\ntrained on English from 42.2% to 50.8% on GSM8K testset.\n","authors":["Nuo Chen","Zinan Zheng","Ning Wu","Ming Gong","Yangqiu Song","Dongmei Zhang","Jia Li"],"pdf_url":"https://arxiv.org/pdf/2310.20246v4.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2310.08559v2","updated":"2023-11-28T05:24:30Z","published":"2023-10-12T17:51:10Z","title":"Phenomenal Yet Puzzling: Testing Inductive Reasoning Capabilities of\n Language Models with Hypothesis Refinement","summary":" The ability to derive underlying principles from a handful of observations\nand then generalize to novel situations -- known as inductive reasoning -- is\ncentral to human intelligence. Prior work suggests that language models (LMs)\noften fall short on inductive reasoning, despite achieving impressive success\non research benchmarks. In this work, we conduct a systematic study of the\ninductive reasoning capabilities of LMs through iterative hypothesis\nrefinement, a technique that more closely mirrors the human inductive process\nthan standard input-output prompting. Iterative hypothesis refinement employs a\nthree-step process: proposing, selecting, and refining hypotheses in the form\nof textual rules. By examining the intermediate rules, we observe that LMs are\nphenomenal hypothesis proposers (i.e., generating candidate rules), and when\ncoupled with a (task-specific) symbolic interpreter that is able to\nsystematically filter the proposed set of rules, this hybrid approach achieves\nstrong results across inductive reasoning benchmarks that require inducing\ncausal relations, language-like instructions, and symbolic concepts. However,\nthey also behave as puzzling inductive reasoners, showing notable performance\ngaps between rule induction (i.e., identifying plausible rules) and rule\napplication (i.e., applying proposed rules to instances), suggesting that LMs\nare proposing hypotheses without being able to actually apply the rules.\nThrough empirical and human analyses, we further reveal several discrepancies\nbetween the inductive reasoning processes of LMs and humans, shedding light on\nboth the potentials and limitations of using LMs in inductive reasoning tasks.\n","authors":["Linlu Qiu","Liwei Jiang","Ximing Lu","Melanie Sclar","Valentina Pyatkin","Chandra Bhagavatula","Bailin Wang","Yoon Kim","Yejin Choi","Nouha Dziri","Xiang Ren"],"pdf_url":"https://arxiv.org/pdf/2310.08559v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16522v1","updated":"2023-11-28T05:00:27Z","published":"2023-11-28T05:00:27Z","title":"Evaluation of dynamic characteristics of power grid based on GNN and\n application on knowledge graph","summary":" A novel method for detecting faults in power grids using a graph neural\nnetwork (GNN) has been developed, aimed at enhancing intelligent fault\ndiagnosis in network operation and maintenance. This GNN-based approach\nidentifies faulty nodes within the power grid through a specialized electrical\nfeature extraction model coupled with a knowledge graph. Incorporating temporal\ndata, the method leverages the status of nodes from preceding and subsequent\ntime periods to aid in current fault detection. 
To validate the effectiveness\nof this GNN in extracting node features, a correlation analysis of the output\nfeatures from each node within the neural network layer was conducted. The\nresults from experiments show that this method can accurately locate fault\nnodes in simulated scenarios with a remarkable 99.53% accuracy. Additionally,\nthe graph neural network's feature modeling allows for a qualitative\nexamination of how faults spread across nodes, providing valuable insights for\nanalyzing fault nodes.\n","authors":["Hao Pei","Si Lin","Chuanfu Li","Che Wang","Haoming Chen","Sizhe Li"],"pdf_url":"https://arxiv.org/pdf/2311.16522v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16509v1","updated":"2023-11-28T04:49:17Z","published":"2023-11-28T04:49:17Z","title":"StyleCap: Automatic Speaking-Style Captioning from Speech Based on\n Speech and Language Self-supervised Learning Models","summary":" We propose StyleCap, a method to generate natural language descriptions of\nspeaking styles appearing in speech. Although most of conventional techniques\nfor para-/non-linguistic information recognition focus on the category\nclassification or the intensity estimation of pre-defined labels, they cannot\nprovide the reasoning of the recognition result in an interpretable manner. As\na first step towards an end-to-end method for generating speaking-style prompts\nfrom speech, i.e., automatic speaking-style captioning, StyleCap uses paired\ndata of speech and natural language descriptions to train neural networks that\npredict prefix vectors fed into a large language model (LLM)-based text decoder\nfrom a speech representation vector. We explore an appropriate text decoder and\nspeech feature representation suitable for this new task. The experimental\nresults demonstrate that our StyleCap leveraging richer LLMs for the text\ndecoder, speech self-supervised learning (SSL) features, and sentence\nrephrasing augmentation improves the accuracy and diversity of generated\nspeaking-style captions. Samples of speaking-style captions generated by our\nStyleCap are publicly available.\n","authors":["Kazuki Yamauchi","Yusuke Ijima","Yuki Saito"],"pdf_url":"https://arxiv.org/pdf/2311.16509v1.pdf","comment":"Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2311.16466v1","updated":"2023-11-28T04:07:34Z","published":"2023-11-28T04:07:34Z","title":"Enhancing Human Persuasion With Large Language Models","summary":" Although large language models (LLMs) are reshaping various aspects of human\nlife, our current understanding of their impacts remains somewhat constrained.\nHere we investigate the impact of LLMs on human communication, in the context\nof consumer complaints in the financial industry. Employing an AI detection\ntool on more than 780K complaints gathered by the Consumer Financial Protection\nBureau (CFPB), we find evidence of LLM usage in the writing of complaints -\nshortly after the release of ChatGPT. Our analyses reveal that LLM usage is\npositively correlated with the likelihood of obtaining desirable outcomes\n(i.e., offer of relief from financial firms) and suggest that this positive\ncorrelation may be partly due to the linguistic features improved by LLMs. 
We\ntest this conjecture with a preregistered experiment, which reveals results\nconsistent with those from observational studies: Consumer complaints written\nwith ChatGPT for improved linguistic qualities were more likely to receive\nhypothetical relief offers than the original consumer complaints, demonstrating\nthe LLM's ability to enhance message persuasiveness in human communication.\nBeing some of the earliest empirical evidence on LLM usage for enhancing\npersuasion, our results highlight the transformative potential of LLMs in human\ncommunication.\n","authors":["Minkyu Shin","Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2311.16466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03211v2","updated":"2023-11-28T03:50:54Z","published":"2023-10-04T23:33:36Z","title":"On the Performance of Multimodal Language Models","summary":" Instruction-tuned large language models (LLMs) have demonstrated promising\nzero-shot generalization capabilities across various downstream tasks. Recent\nresearch has introduced multimodal capabilities to LLMs by integrating\nindependently pretrained vision encoders through model grafting. These\nmultimodal variants undergo instruction tuning, similar to LLMs, enabling\neffective zero-shot generalization for multimodal tasks. This study conducts a\ncomparative analysis of different multimodal instruction tuning approaches and\nevaluates their performance across a range of tasks, including complex\nreasoning, conversation, image captioning, multiple-choice questions (MCQs),\nand binary classification. Through rigorous benchmarking and ablation\nexperiments, we reveal key insights for guiding architectural choices when\nincorporating multimodal capabilities into LLMs. However, current approaches\nhave limitations; they do not sufficiently address the need for a diverse\nmultimodal instruction dataset, which is crucial for enhancing task\ngeneralization. Additionally, they overlook issues related to truthfulness and\nfactuality when generating responses. These findings illuminate current\nmethodological constraints in adapting language models for image comprehension\nand provide valuable guidance for researchers and practitioners seeking to\nharness multimodal versions of LLMs.\n","authors":["Utsav Garg","Erhan Bas"],"pdf_url":"https://arxiv.org/pdf/2310.03211v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16452v1","updated":"2023-11-28T03:16:12Z","published":"2023-11-28T03:16:12Z","title":"Can Generalist Foundation Models Outcompete Special-Purpose Tuning? Case\n Study in Medicine","summary":" Generalist foundation models such as GPT-4 have displayed surprising\ncapabilities in a wide variety of domains and tasks. Yet, there is a prevalent\nassumption that they cannot match specialist capabilities of fine-tuned models.\nFor example, most explorations to date on medical competency benchmarks have\nleveraged domain-specific training, as exemplified by efforts on BioGPT and\nMed-PaLM. We build on a prior study of GPT-4's capabilities on medical\nchallenge benchmarks in the absence of special training. Rather than using\nsimple prompting to highlight the model's out-of-the-box capabilities, we\nperform a systematic exploration of prompt engineering. We find that prompting\ninnovation can unlock deeper specialist capabilities and show that GPT-4 easily\ntops prior leading results for medical benchmarks. The prompting methods we\nexplore are general purpose, and make no specific use of domain expertise,\nremoving the need for expert-curated content. 
Our experimental design carefully\ncontrols for overfitting during the prompt engineering process. We introduce\nMedprompt, based on a composition of several prompting strategies. With\nMedprompt, GPT-4 achieves state-of-the-art results on all nine of the benchmark\ndatasets in the MultiMedQA suite. The method outperforms leading specialist\nmodels such as Med-PaLM 2 by a significant margin with an order of magnitude\nfewer calls to the model. Steering GPT-4 with Medprompt achieves a 27%\nreduction in error rate on the MedQA dataset over the best methods to date\nachieved with specialist models and surpasses a score of 90% for the first\ntime. Beyond medical problems, we show the power of Medprompt to generalize to\nother domains and provide evidence for the broad applicability of the approach\nvia studies of the strategy on exams in electrical engineering, machine\nlearning, philosophy, accounting, law, nursing, and clinical psychology.\n","authors":["Harsha Nori","Yin Tat Lee","Sheng Zhang","Dean Carignan","Richard Edgar","Nicolo Fusi","Nicholas King","Jonathan Larson","Yuanzhi Li","Weishung Liu","Renqian Luo","Scott Mayer McKinney","Robert Osazuwa Ness","Hoifung Poon","Tao Qin","Naoto Usuyama","Chris White","Eric Horvitz"],"pdf_url":"https://arxiv.org/pdf/2311.16452v1.pdf","comment":"21 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.16444v1","updated":"2023-11-28T02:51:13Z","published":"2023-11-28T02:51:13Z","title":"Exo2EgoDVC: Dense Video Captioning of Egocentric Procedural Activities\n Using Web Instructional Videos","summary":" We propose a novel benchmark for cross-view knowledge transfer of dense video\ncaptioning, adapting models from web instructional videos with exocentric views\nto an egocentric view. While dense video captioning (predicting time segments\nand their captions) is primarily studied with exocentric videos (e.g.,\nYouCook2), benchmarks with egocentric videos are restricted due to data\nscarcity. To overcome the limited video availability, transferring knowledge\nfrom abundant exocentric web videos is demanded as a practical approach.\nHowever, learning the correspondence between exocentric and egocentric views is\ndifficult due to their dynamic view changes. The web videos contain mixed views\nfocusing on either human body actions or close-up hand-object interactions,\nwhile the egocentric view is constantly shifting as the camera wearer moves.\nThis necessitates the in-depth study of cross-view transfer under complex view\nchanges. In this work, we first create a real-life egocentric dataset (EgoYC2)\nwhose captions are shared with YouCook2, enabling transfer learning between\nthese datasets assuming their ground-truth is accessible. To bridge the view\ngaps, we propose a view-invariant learning method using adversarial training in\nboth the pre-training and fine-tuning stages. While the pre-training is\ndesigned to learn invariant features against the mixed views in the web videos,\nthe view-invariant fine-tuning further mitigates the view gaps between both\ndatasets. We validate our proposed method by studying how effectively it\novercomes the view change problem and efficiently transfers the knowledge to\nthe egocentric domain. 
Our benchmark pushes the study of the cross-view\ntransfer into a new task domain of dense video captioning and will envision\nmethodologies to describe egocentric videos in natural language.\n","authors":["Takehiko Ohkawa","Takuma Yagi","Taichi Nishimura","Ryosuke Furuta","Atsushi Hashimoto","Yoshitaka Ushiku","Yoichi Sato"],"pdf_url":"https://arxiv.org/pdf/2311.16444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15544v2","updated":"2023-11-28T02:04:58Z","published":"2023-11-27T05:20:47Z","title":"The effect of source disclosure on evaluation of AI-generated messages:\n A two-part study","summary":" Advancements in artificial intelligence (AI) over the last decade demonstrate\nthat machines can exhibit communicative behavior and influence how humans\nthink, feel, and behave. In fact, the recent development of ChatGPT has shown\nthat large language models (LLMs) can be leveraged to generate high-quality\ncommunication content at scale and across domains, suggesting that they will be\nincreasingly used in practice. However, many questions remain about how knowing\nthe source of the messages influences recipients' evaluation of and preference\nfor AI-generated messages compared to human-generated messages. This paper\ninvestigated this topic in the context of vaping prevention messaging. In Study\n1, which was pre-registered, we examined the influence of source disclosure on\npeople's evaluation of AI-generated health prevention messages compared to\nhuman-generated messages. We found that source disclosure (i.e., labeling the\nsource of a message as AI vs. human) significantly impacted the evaluation of\nthe messages but did not significantly alter message rankings. In a follow-up\nstudy (Study 2), we examined how the influence of source disclosure may vary by\nthe participants' negative attitudes towards AI. We found a significant\nmoderating effect of negative attitudes towards AI on message evaluation, but\nnot for message selection. However, for those with moderate levels of negative\nattitudes towards AI, source disclosure decreased the preference for\nAI-generated messages. Overall, the results of this series of studies showed a\nslight bias against AI-generated messages once the source was disclosed, adding\nto the emerging area of study that lies at the intersection of AI and\ncommunication.\n","authors":["Sue Lim","Ralf Schmälzle"],"pdf_url":"https://arxiv.org/pdf/2311.15544v2.pdf","comment":"Manuscript currently under review. Paper presented at 109th Annual\n National Communication Association (NCA) Conference, November 16-19, 2023. 10\n pages, 5 figures. Supplementary file formatting updated in current version"},{"id":"http://arxiv.org/abs/2311.16421v1","updated":"2023-11-28T02:01:25Z","published":"2023-11-28T02:01:25Z","title":"CDEval: A Benchmark for Measuring the Cultural Dimensions of Large\n Language Models","summary":" As the scaling of Large Language Models (LLMs) has dramatically enhanced\ntheir capabilities, there has been a growing focus on the alignment problem to\nensure their responsible and ethical use. While existing alignment efforts\npredominantly concentrate on universal values such as the HHH principle, the\naspect of culture, which is inherently pluralistic and diverse, has not\nreceived adequate attention. This work introduces a new benchmark, CDEval,\naimed at evaluating the cultural dimensions of LLMs. 
CDEval is constructed by\nincorporating both GPT-4's automated generation and human verification,\ncovering six cultural dimensions across seven domains. Our comprehensive\nexperiments provide intriguing insights into the culture of mainstream LLMs,\nhighlighting both consistencies and variations across different dimensions and\ndomains. The findings underscore the importance of integrating cultural\nconsiderations in LLM development, particularly for applications in diverse\ncultural settings. Through CDEval, we aim to broaden the horizon of LLM\nalignment research by including cultural dimensions, thus providing a more\nholistic framework for the future development and evaluation of LLMs. This\nbenchmark serves as a valuable resource for cultural studies in LLMs, paving\nthe way for more culturally aware and sensitive models.\n","authors":["Yuhang Wang","Yanxu Zhu","Chao Kong","Shuyu Wei","Xiaoyuan Yi","Xing Xie","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2311.16421v1.pdf","comment":"Work in process"},{"id":"http://arxiv.org/abs/2309.02705v2","updated":"2023-11-28T01:56:17Z","published":"2023-09-06T04:37:20Z","title":"Certifying LLM Safety against Adversarial Prompting","summary":" Large language models (LLMs) released for public use incorporate guardrails\nto ensure their output is safe, often referred to as \"model alignment.\" An\naligned language model should decline a user's request to produce harmful\ncontent. However, such safety measures are vulnerable to adversarial attacks,\nwhich add maliciously designed token sequences to a harmful prompt to bypass\nthe model's safety guards. In this work, we introduce erase-and-check, the\nfirst framework to defend against adversarial prompts with verifiable safety\nguarantees. We defend against three attack modes: i) adversarial suffix, which\nappends an adversarial sequence at the end of the prompt; ii) adversarial\ninsertion, where the adversarial sequence is inserted anywhere in the middle of\nthe prompt; and iii) adversarial infusion, where adversarial tokens are\ninserted at arbitrary positions in the prompt, not necessarily as a contiguous\nblock. Our experimental results demonstrate that this procedure can obtain\nstrong certified safety guarantees on harmful prompts while maintaining good\nempirical performance on safe prompts. For example, against adversarial\nsuffixes of length 20, it certifiably detects 92% of harmful prompts and labels\n94% of safe prompts correctly using the open-source language model Llama 2 as\nthe safety filter. We further improve the filter's performance, in terms of\naccuracy and speed, by replacing Llama 2 with a DistilBERT safety classifier\nfine-tuned on safe and harmful prompts. Additionally, we propose two efficient\nempirical defenses: i) RandEC, a randomized version of erase-and-check that\nevaluates the safety filter on a small subset of the erased subsequences, and\nii) GradEC, a gradient-based version that optimizes the erased tokens to remove\nthe adversarial sequence. 
The code for our experiments is available at\nhttps://github.com/aounon/certified-llm-safety.\n","authors":["Aounon Kumar","Chirag Agarwal","Suraj Srinivas","Aaron Jiaxun Li","Soheil Feizi","Himabindu Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2309.02705v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17280v1","updated":"2023-11-28T23:40:13Z","published":"2023-11-28T23:40:13Z","title":"Does VLN Pretraining Work with Nonsensical or Irrelevant Instructions?","summary":" Data augmentation via back-translation is common when pretraining\nVision-and-Language Navigation (VLN) models, even though the generated\ninstructions are noisy. But: does that noise matter? We find that nonsensical\nor irrelevant language instructions during pretraining can have little effect\non downstream performance for both HAMT and VLN-BERT on R2R, and is still\nbetter than only using clean, human data. To underscore these results, we\nconcoct an efficient augmentation method, Unigram + Object, which generates\nnonsensical instructions that nonetheless improve downstream performance. Our\nfindings suggest that what matters for VLN R2R pretraining is the quantity of\nvisual trajectories, not the quality of instructions.\n","authors":["Wang Zhu","Ishika Singh","Yuan Huang","Robin Jia","Jesse Thomason"],"pdf_url":"https://arxiv.org/pdf/2311.17280v1.pdf","comment":"17 pages, 8 figures"},{"id":"http://arxiv.org/abs/2311.17264v1","updated":"2023-11-28T22:54:33Z","published":"2023-11-28T22:54:33Z","title":"RETSim: Resilient and Efficient Text Similarity","summary":" This paper introduces RETSim (Resilient and Efficient Text Similarity), a\nlightweight, multilingual deep learning model trained to produce robust metric\nembeddings for near-duplicate text retrieval, clustering, and dataset\ndeduplication tasks. We demonstrate that RETSim is significantly more robust\nand accurate than MinHash and neural text embeddings, achieving new\nstate-of-the-art performance on dataset deduplication, adversarial text\nretrieval benchmarks, and spam clustering tasks. We also introduce the W4NT3D\nbenchmark (Wiki-40B 4dversarial Near-T3xt Dataset) for evaluating multilingual,\nnear-duplicate text retrieval capabilities under adversarial settings. RETSim\nand the W4NT3D benchmark are open-sourced under the MIT License at\nhttps://github.com/google/unisim.\n","authors":["Marina Zhang","Owen Vallis","Aysegul Bumin","Tanay Vakharia","Elie Bursztein"],"pdf_url":"https://arxiv.org/pdf/2311.17264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19103v2","updated":"2023-11-28T21:18:05Z","published":"2023-05-30T15:06:28Z","title":"Does Conceptual Representation Require Embodiment? Insights From Large\n Language Models","summary":" To what extent can language alone give rise to complex concepts, or is\nembodied experience essential? Recent advancements in large language models\n(LLMs) offer fresh perspectives on this question. Although LLMs are trained on\nrestricted modalities, they exhibit human-like performance in diverse\npsychological tasks. Our study compared representations of 4,442 lexical\nconcepts between humans and ChatGPTs (GPT-3.5 and GPT-4) across multiple\ndimensions, including five key domains: emotion, salience, mental\nvisualization, sensory, and motor experience. 
We identify two main findings: 1)\nBoth models strongly align with human representations in non-sensorimotor\ndomains but lag in sensory and motor areas, with GPT-4 outperforming GPT-3.5;\n2) GPT-4's gains are associated with its additional visual learning, which also\nappears to benefit related dimensions like haptics and imageability. These\nresults highlight the limitations of language in isolation, and that the\nintegration of diverse modalities of inputs leads to a more human-like\nconceptual representation.\n","authors":["Qihui Xu","Yingying Peng","Minghua Wu","Feng Xiao","Martin Chodorow","Ping Li"],"pdf_url":"https://arxiv.org/pdf/2305.19103v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17233v1","updated":"2023-11-28T21:15:24Z","published":"2023-11-28T21:15:24Z","title":"Quantifying the redundancy between prosody and text","summary":" Prosody -- the suprasegmental component of speech, including pitch, loudness,\nand tempo -- carries critical aspects of meaning. However, the relationship\nbetween the information conveyed by prosody vs. by the words themselves remains\npoorly understood. We use large language models (LLMs) to estimate how much\ninformation is redundant between prosody and the words themselves. Using a\nlarge spoken corpus of English audiobooks, we extract prosodic features aligned\nto individual words and test how well they can be predicted from LLM\nembeddings, compared to non-contextual word embeddings. We find a high degree\nof redundancy between the information carried by the words and prosodic\ninformation across several prosodic features, including intensity, duration,\npauses, and pitch contours. Furthermore, a word's prosodic information is\nredundant with both the word itself and the context preceding as well as\nfollowing it. Still, we observe that prosodic features can not be fully\npredicted from text, suggesting that prosody carries information above and\nbeyond the words. Along with this paper, we release a general-purpose data\nprocessing pipeline for quantifying the relationship between linguistic\ninformation and extra-linguistic features.\n","authors":["Lukas Wolf","Tiago Pimentel","Evelina Fedorenko","Ryan Cotterell","Alex Warstadt","Ethan Wilcox","Tamar Regev"],"pdf_url":"https://arxiv.org/pdf/2311.17233v1.pdf","comment":"Published at The 2023 Conference on Empirical Methods in Natural\n Language Processing (EMNLP)"},{"id":"http://arxiv.org/abs/2311.14743v2","updated":"2023-11-28T21:04:36Z","published":"2023-11-21T18:41:26Z","title":"A Baseline Analysis of Reward Models' Ability To Accurately Analyze\n Foundation Models Under Distribution Shift","summary":" Foundation models, specifically Large Language Models (LLM's), have lately\ngained wide-spread attention and adoption. Reinforcement Learning with Human\nFeedback (RLHF) involves training a reward model to capture desired behaviors,\nwhich is then used to align an LLM. These reward models are additionally used\nat inference-time to estimate how well LLM responses adhere to those desired\nbehaviors. However, there is little work measuring how robust these reward\nmodels are to distribution shifts. In this work, we evaluate how reward model\nperformance - measured via accuracy and calibration (i.e. alignment between\naccuracy and confidence) - is affected by distribution shift. 
We show novel\ncalibration patterns and accuracy drops due to OOD prompts and responses, and\nthat the reward model is more sensitive to shifts in responses than prompts.\nAdditionally, we adapt an OOD detection technique commonly used in\nclassification to the reward model setting in order to detect these\ndistribution shifts in prompts and responses.\n","authors":["Ben Pikus","Will LeVine","Tony Chen","Sean Hendryx"],"pdf_url":"https://arxiv.org/pdf/2311.14743v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17227v1","updated":"2023-11-28T20:59:49Z","published":"2023-11-28T20:59:49Z","title":"War and Peace (WarAgent): Large Language Model-based Multi-Agent\n Simulation of World Wars","summary":" Can we avoid wars at the crossroads of history? This question has been\npursued by individuals, scholars, policymakers, and organizations throughout\nhuman history. In this research, we attempt to answer the question based on the\nrecent advances of Artificial Intelligence (AI) and Large Language Models\n(LLMs). We propose \\textbf{WarAgent}, an LLM-powered multi-agent AI system, to\nsimulate the participating countries, their decisions, and the consequences, in\nhistorical international conflicts, including the World War I (WWI), the World\nWar II (WWII), and the Warring States Period (WSP) in Ancient China. By\nevaluating the simulation effectiveness, we examine the advancements and\nlimitations of cutting-edge AI systems' abilities in studying complex\ncollective human behaviors such as international conflicts under diverse\nsettings. In these simulations, the emergent interactions among agents also\noffer a novel perspective for examining the triggers and conditions that lead\nto war. Our findings offer data-driven and AI-augmented insights that can\nredefine how we approach conflict resolution and peacekeeping strategies. The\nimplications stretch beyond historical analysis, offering a blueprint for using\nAI to understand human history and possibly prevent future international\nconflicts. Code and data are available at\n\\url{https://github.com/agiresearch/WarAgent}.\n","authors":["Wenyue Hua","Lizhou Fan","Lingyao Li","Kai Mei","Jianchao Ji","Yingqiang Ge","Libby Hemphill","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.17227v1.pdf","comment":"40 pages, 7 figures"},{"id":"http://arxiv.org/abs/2310.14566v2","updated":"2023-11-28T20:56:41Z","published":"2023-10-23T04:49:09Z","title":"HallusionBench: An Advanced Diagnostic Suite for Entangled Language\n Hallucination & Visual Illusion in Large Vision-Language Models","summary":" We introduce HallusionBench, a comprehensive benchmark designed for the\nevaluation of image-context reasoning. This benchmark presents significant\nchallenges to advanced large visual-language models (LVLMs), such as\nGPT-4V(Vision) and LLaVA-1.5, by emphasizing nuanced understanding and\ninterpretation of visual data. The benchmark comprises 346 images paired with\n1129 questions, all meticulously crafted by human experts. We introduce a novel\nstructure for these visual questions designed to establish control groups. This\nstructure enables us to conduct a quantitative analysis of the models' response\ntendencies, logical consistency, and various failure modes. In our evaluation\non HallusionBench, we benchmarked 13 different models, highlighting a 31.42%\nquestion-pair accuracy achieved by the state-of-the-art GPT-4V. Notably, all\nother evaluated models achieve accuracy below 16%. 
Moreover, our analysis not\nonly highlights the observed failure modes, including language hallucination\nand visual illusion, but also deepens an understanding of these pitfalls. Our\ncomprehensive case studies within HallusionBench shed light on the challenges\nof hallucination and illusion in LVLMs. Based on these insights, we suggest\npotential pathways for their future improvement. The benchmark and codebase can\nbe accessed at https://github.com/tianyi-lab/HallusionBench.\n","authors":["Tianrui Guan","Fuxiao Liu","Xiyang Wu","Ruiqi Xian","Zongxia Li","Xiaoyu Liu","Xijun Wang","Lichang Chen","Furong Huang","Yaser Yacoob","Dinesh Manocha","Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.14566v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17213v1","updated":"2023-11-28T20:34:40Z","published":"2023-11-28T20:34:40Z","title":"General-Purpose vs. Domain-Adapted Large Language Models for Extraction\n of Data from Thoracic Radiology Reports","summary":" Radiologists produce unstructured data that could be valuable for clinical\ncare when consumed by information systems. However, variability in style limits\nusage. Study compares performance of system using domain-adapted language model\n(RadLing) and general-purpose large language model (GPT-4) in extracting common\ndata elements (CDE) from thoracic radiology reports. Three radiologists\nannotated a retrospective dataset of 1300 thoracic reports (900 training, 400\ntest) and mapped to 21 pre-selected relevant CDEs. RadLing was used to generate\nembeddings for sentences and identify CDEs using cosine-similarity, which were\nmapped to values using light-weight mapper. GPT-4 system used OpenAI's\ngeneral-purpose embeddings to identify relevant CDEs and used GPT-4 to map to\nvalues. The output CDE:value pairs were compared to the reference standard; an\nidentical match was considered true positive. Precision (positive predictive\nvalue) was 96% (2700/2824) for RadLing and 99% (2034/2047) for GPT-4. Recall\n(sensitivity) was 94% (2700/2876) for RadLing and 70% (2034/2887) for GPT-4;\nthe difference was statistically significant (P<.001). RadLing's domain-adapted\nembeddings were more sensitive in CDE identification (95% vs 71%) and its\nlight-weight mapper had comparable precision in value assignment (95.4% vs\n95.0%). RadLing system exhibited higher performance than GPT-4 system in\nextracting CDEs from radiology reports. RadLing system's domain-adapted\nembeddings outperform general-purpose embeddings from OpenAI in CDE\nidentification and its light-weight value mapper achieves comparable precision\nto large GPT-4. RadLing system offers operational advantages including local\ndeployment and reduced runtime costs. Domain-adapted RadLing system surpasses\nGPT-4 system in extracting common data elements from radiology reports, while\nproviding benefits of local deployment and lower costs.\n","authors":["Ali H. Dhanaliwala","Rikhiya Ghosh","Sanjeev Kumar Karn","Poikavila Ullaskrishnan","Oladimeji Farri","Dorin Comaniciu","Charles E. Kahn"],"pdf_url":"https://arxiv.org/pdf/2311.17213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05052v2","updated":"2023-11-28T20:12:36Z","published":"2023-07-11T07:03:29Z","title":"Towards Understanding In-Context Learning with Contrastive\n Demonstrations and Saliency Maps","summary":" We investigate the role of various demonstration components in the in-context\nlearning (ICL) performance of large language models (LLMs). 
Specifically, we\nexplore the impacts of ground-truth labels, input distribution, and\ncomplementary explanations, particularly when these are altered or perturbed.\nWe build on previous work, which offers mixed findings on how these elements\ninfluence ICL. To probe these questions, we employ explainable NLP (XNLP)\nmethods and utilize saliency maps of contrastive demonstrations for both\nqualitative and quantitative analysis. Our findings reveal that flipping\nground-truth labels significantly affects the saliency, though it's more\nnoticeable in larger LLMs. Our analysis of the input distribution at a granular\nlevel reveals that changing sentiment-indicative terms in a sentiment analysis\ntask to neutral ones does not have as substantial an impact as altering\nground-truth labels. Finally, we find that the effectiveness of complementary\nexplanations in boosting ICL performance is task-dependent, with limited\nbenefits seen in sentiment analysis tasks compared to symbolic reasoning tasks.\nThese insights are critical for understanding the functionality of LLMs and\nguiding the development of effective demonstrations, which is increasingly\nrelevant in light of the growing use of LLMs in applications such as ChatGPT.\nOur research code is publicly available at https://github.com/paihengxu/XICL.\n","authors":["Paiheng Xu","Fuxiao Liu","Zongxia Li","Hyemi Song"],"pdf_url":"https://arxiv.org/pdf/2307.05052v2.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.12931v2","updated":"2023-11-28T19:06:49Z","published":"2023-09-22T15:30:53Z","title":"On Separate Normalization in Self-supervised Transformers","summary":" Self-supervised training methods for transformers have demonstrated\nremarkable performance across various domains. Previous transformer-based\nmodels, such as masked autoencoders (MAE), typically utilize a single\nnormalization layer for both the [CLS] symbol and the tokens. We propose in\nthis paper a simple modification that employs separate normalization layers for\nthe tokens and the [CLS] symbol to better capture their distinct\ncharacteristics and enhance downstream task performance. Our method aims to\nalleviate the potential negative effects of using the same normalization\nstatistics for both token types, which may not be optimally aligned with their\nindividual roles. We empirically show that by utilizing a separate\nnormalization layer, the [CLS] embeddings can better encode the global\ncontextual information and are distributed more uniformly in its anisotropic\nspace. When replacing the conventional normalization layer with the two\nseparate layers, we observe an average 2.7% performance improvement over the\nimage, natural language, and graph domains.\n","authors":["Xiaohui Chen","Yinkai Wang","Yuanqi Du","Soha Hassoun","Li-Ping Liu"],"pdf_url":"https://arxiv.org/pdf/2309.12931v2.pdf","comment":"NIPS 2023"},{"id":"http://arxiv.org/abs/2309.01029v3","updated":"2023-11-28T19:04:45Z","published":"2023-09-02T22:14:26Z","title":"Explainability for Large Language Models: A Survey","summary":" Large language models (LLMs) have demonstrated impressive capabilities in\nnatural language processing. However, their internal mechanisms are still\nunclear and this lack of transparency poses unwanted risks for downstream\napplications. Therefore, understanding and explaining these models is crucial\nfor elucidating their behaviors, limitations, and social impacts. 
In this\npaper, we introduce a taxonomy of explainability techniques and provide a\nstructured overview of methods for explaining Transformer-based language\nmodels. We categorize techniques based on the training paradigms of LLMs:\ntraditional fine-tuning-based paradigm and prompting-based paradigm. For each\nparadigm, we summarize the goals and dominant approaches for generating local\nexplanations of individual predictions and global explanations of overall model\nknowledge. We also discuss metrics for evaluating generated explanations, and\ndiscuss how explanations can be leveraged to debug models and improve\nperformance. Lastly, we examine key challenges and emerging opportunities for\nexplanation techniques in the era of LLMs in comparison to conventional machine\nlearning models.\n","authors":["Haiyan Zhao","Hanjie Chen","Fan Yang","Ninghao Liu","Huiqi Deng","Hengyi Cai","Shuaiqiang Wang","Dawei Yin","Mengnan Du"],"pdf_url":"https://arxiv.org/pdf/2309.01029v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17154v1","updated":"2023-11-28T19:00:03Z","published":"2023-11-28T19:00:03Z","title":"Pragmatic Radiology Report Generation","summary":" When pneumonia is not found on a chest X-ray, should the report describe this\nnegative observation or omit it? We argue that this question cannot be answered\nfrom the X-ray alone and requires a pragmatic perspective, which captures the\ncommunicative goal that radiology reports serve between radiologists and\npatients. However, the standard image-to-text formulation for radiology report\ngeneration fails to incorporate such pragmatic intents. Following this\npragmatic perspective, we demonstrate that the indication, which describes why\na patient comes for an X-ray, drives the mentions of negative observations and\nintroduce indications as additional input to report generation. With respect to\nthe output, we develop a framework to identify uninferable information from the\nimage as a source of model hallucinations, and limit them by cleaning\ngroundtruth reports. Finally, we use indications and cleaned groundtruth\nreports to develop pragmatic models, and show that they outperform existing\nmethods not only in new pragmatics-inspired metrics (+4.3 Negative F1) but also\nin standard metrics (+6.3 Positive F1 and +11.0 BLEU-2).\n","authors":["Dang Nguyen","Chacha Chen","He He","Chenhao Tan"],"pdf_url":"https://arxiv.org/pdf/2311.17154v1.pdf","comment":"18 pages, 1 figure, 18 tables. Code at\n https://github.com/ChicagoHAI/llm_radiology"},{"id":"http://arxiv.org/abs/2311.17136v1","updated":"2023-11-28T18:55:52Z","published":"2023-11-28T18:55:52Z","title":"UniIR: Training and Benchmarking Universal Multimodal Information\n Retrievers","summary":" Existing information retrieval (IR) models often assume a homogeneous format,\nlimiting their applicability to diverse user needs, such as searching for\nimages with text descriptions, searching for a news article with a headline\nimage, or finding a similar photo with a query image. To approach such\ndifferent information-seeking demands, we introduce UniIR, a unified\ninstruction-guided multimodal retriever capable of handling eight distinct\nretrieval tasks across modalities. UniIR, a single retrieval system jointly\ntrained on ten diverse multimodal-IR datasets, interprets user instructions to\nexecute various retrieval tasks, demonstrating robust performance across\nexisting datasets and zero-shot generalization to new tasks. 
Our experiments\nhighlight that multi-task training and instruction tuning are keys to UniIR's\ngeneralization ability. Additionally, we construct the M-BEIR, a multimodal\nretrieval benchmark with comprehensive results, to standardize the evaluation\nof universal multimodal information retrieval.\n","authors":["Cong Wei","Yang Chen","Haonan Chen","Hexiang Hu","Ge Zhang","Jie Fu","Alan Ritter","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2311.17136v1.pdf","comment":"Our code and dataset are available on this project page:\n https://tiger-ai-lab.github.io/UniIR/"},{"id":"http://arxiv.org/abs/2311.17126v1","updated":"2023-11-28T14:51:13Z","published":"2023-11-28T14:51:13Z","title":"Reason out Your Layout: Evoking the Layout Master from Large Language\n Models for Text-to-Image Synthesis","summary":" Recent advancements in text-to-image (T2I) generative models have shown\nremarkable capabilities in producing diverse and imaginative visuals based on\ntext prompts. Despite the advancement, these diffusion models sometimes\nstruggle to translate the semantic content from the text into images entirely.\nWhile conditioning on the layout has shown to be effective in improving the\ncompositional ability of T2I diffusion models, they typically require manual\nlayout input. In this work, we introduce a novel approach to improving T2I\ndiffusion models using Large Language Models (LLMs) as layout generators. Our\nmethod leverages the Chain-of-Thought prompting of LLMs to interpret text and\ngenerate spatially reasonable object layouts. The generated layout is then used\nto enhance the generated images' composition and spatial accuracy. Moreover, we\npropose an efficient adapter based on a cross-attention mechanism, which\nexplicitly integrates the layout information into the stable diffusion models.\nOur experiments demonstrate significant improvements in image quality and\nlayout accuracy, showcasing the potential of LLMs in augmenting generative\nimage models.\n","authors":["Xiaohui Chen","Yongfei Liu","Yingxiang Yang","Jianbo Yuan","Quanzeng You","Li-Ping Liu","Hongxia Yang"],"pdf_url":"https://arxiv.org/pdf/2311.17126v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2311.17107v1","updated":"2023-11-28T10:26:57Z","published":"2023-11-28T10:26:57Z","title":"ClimateX: Do LLMs Accurately Assess Human Expert Confidence in Climate\n Statements?","summary":" Evaluating the accuracy of outputs generated by Large Language Models (LLMs)\nis especially important in the climate science and policy domain. We introduce\nthe Expert Confidence in Climate Statements (ClimateX) dataset, a novel,\ncurated, expert-labeled dataset consisting of 8094 climate statements collected\nfrom the latest Intergovernmental Panel on Climate Change (IPCC) reports,\nlabeled with their associated confidence levels. Using this dataset, we show\nthat recent LLMs can classify human expert confidence in climate-related\nstatements, especially in a few-shot learning setting, but with limited (up to\n47%) accuracy. Overall, models exhibit consistent and significant\nover-confidence on low and medium confidence statements. 
We highlight\nimplications of our results for climate communication, LLMs evaluation\nstrategies, and the use of LLMs in information retrieval systems.\n","authors":["Romain Lacombe","Kerrie Wu","Eddie Dilworth"],"pdf_url":"https://arxiv.org/pdf/2311.17107v1.pdf","comment":"Tackling Climate Change with Machine Learning workshop at NeurIPS\n 2023"},{"id":"http://arxiv.org/abs/2311.17086v1","updated":"2023-11-28T02:31:52Z","published":"2023-11-28T02:31:52Z","title":"PEA-Diffusion: Parameter-Efficient Adapter with Knowledge Distillation\n in non-English Text-to-Image Generation","summary":" Text-to-image diffusion models are well-known for their ability to generate\nrealistic images based on textual prompts. However, the existing works have\npredominantly focused on English, lacking support for non-English text-to-image\nmodels. The most commonly used translation methods cannot solve the generation\nproblem related to language culture, while training from scratch on a specific\nlanguage dataset is prohibitively expensive. In this paper, we are inspired to\npropose a simple plug-and-play language transfer method based on knowledge\ndistillation. All we need to do is train a lightweight MLP-like\nparameter-efficient adapter (PEA) with only 6M parameters under teacher\nknowledge distillation along with a small parallel data corpus. We are\nsurprised to find that freezing the parameters of UNet can still achieve\nremarkable performance on the language-specific prompt evaluation set,\ndemonstrating that PEA can stimulate the potential generation ability of the\noriginal UNet. Additionally, it closely approaches the performance of the\nEnglish text-to-image model on a general prompt evaluation set. Furthermore,\nour adapter can be used as a plugin to achieve significant results in\ndownstream tasks in cross-lingual text-to-image generation. Code will be\navailable at: https://github.com/OPPO-Mente-Lab/PEA-Diffusion\n","authors":["Jian Ma","Chen Chen","Qingsong Xie","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2311.17086v1.pdf","comment":"17 pages, 13 figures"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2311.17060v1","updated":"2023-11-28T18:59:58Z","published":"2023-11-28T18:59:58Z","title":"Material Palette: Extraction of Materials from a Single Image","summary":" In this paper, we propose a method to extract physically-based rendering\n(PBR) materials from a single real-world image. We do so in two steps: first,\nwe map regions of the image to material concepts using a diffusion model, which\nallows the sampling of texture images resembling each material in the scene.\nSecond, we benefit from a separate network to decompose the generated textures\ninto Spatially Varying BRDFs (SVBRDFs), providing us with materials ready to be\nused in rendering applications. Our approach builds on existing synthetic\nmaterial libraries with SVBRDF ground truth, but also exploits a\ndiffusion-generated RGB texture dataset to allow generalization to new samples\nusing unsupervised domain adaptation (UDA). Our contributions are thoroughly\nevaluated on synthetic and real-world datasets. We further demonstrate the\napplicability of our method for editing 3D scenes with materials estimated from\nreal photographs. The code and models will be made open-source. Project page:\nhttps://astra-vision.github.io/MaterialPalette/\n","authors":["Ivan Lopes","Fabio Pizzati","Raoul de Charette"],"pdf_url":"https://arxiv.org/pdf/2311.17060v1.pdf","comment":"8 pages, 11 figures, 2 tables. 
Webpage\n https://astra-vision.github.io/MaterialPalette/"},{"id":"http://arxiv.org/abs/2311.17061v1","updated":"2023-11-28T18:59:58Z","published":"2023-11-28T18:59:58Z","title":"HumanGaussian: Text-Driven 3D Human Generation with Gaussian Splatting","summary":" Realistic 3D human generation from text prompts is a desirable yet\nchallenging task. Existing methods optimize 3D representations like mesh or\nneural fields via score distillation sampling (SDS), which suffers from\ninadequate fine details or excessive training time. In this paper, we propose\nan efficient yet effective framework, HumanGaussian, that generates\nhigh-quality 3D humans with fine-grained geometry and realistic appearance. Our\nkey insight is that 3D Gaussian Splatting is an efficient renderer with\nperiodic Gaussian shrinkage or growing, where such adaptive density control can\nbe naturally guided by intrinsic human structures. Specifically, 1) we first\npropose a Structure-Aware SDS that simultaneously optimizes human appearance\nand geometry. The multi-modal score function from both RGB and depth space is\nleveraged to distill the Gaussian densification and pruning process. 2)\nMoreover, we devise an Annealed Negative Prompt Guidance by decomposing SDS\ninto a noisier generative score and a cleaner classifier score, which well\naddresses the over-saturation issue. The floating artifacts are further\neliminated based on Gaussian size in a prune-only phase to enhance generation\nsmoothness. Extensive experiments demonstrate the superior efficiency and\ncompetitive quality of our framework, rendering vivid 3D humans under diverse\nscenarios. Project Page: https://alvinliu0.github.io/projects/HumanGaussian\n","authors":["Xian Liu","Xiaohang Zhan","Jiaxiang Tang","Ying Shan","Gang Zeng","Dahua Lin","Xihui Liu","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2311.17061v1.pdf","comment":"Project Page: https://alvinliu0.github.io/projects/HumanGaussian"},{"id":"http://arxiv.org/abs/2311.17058v1","updated":"2023-11-28T18:59:57Z","published":"2023-11-28T18:59:57Z","title":"Panoptic Video Scene Graph Generation","summary":" Towards building comprehensive real-world visual perception systems, we\npropose and study a new problem called panoptic scene graph generation (PVSG).\nPVSG relates to the existing video scene graph generation (VidSGG) problem,\nwhich focuses on temporal interactions between humans and objects grounded with\nbounding boxes in videos. However, the limitation of bounding boxes in\ndetecting non-rigid objects and backgrounds often causes VidSGG to miss key\ndetails crucial for comprehensive video understanding. In contrast, PVSG\nrequires nodes in scene graphs to be grounded by more precise, pixel-level\nsegmentation masks, which facilitate holistic scene understanding. To advance\nresearch in this new area, we contribute the PVSG dataset, which consists of\n400 videos (289 third-person + 111 egocentric videos) with a total of 150K\nframes labeled with panoptic segmentation masks as well as fine, temporal scene\ngraphs. We also provide a variety of baseline methods and share useful design\npractices for future work.\n","authors":["Jingkang Yang","Wenxuan Peng","Xiangtai Li","Zujin Guo","Liangyu Chen","Bo Li","Zheng Ma","Kaiyang Zhou","Wayne Zhang","Chen Change Loy","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2311.17058v1.pdf","comment":"Accepted to CVPR 2023. Project Page:\n https://jingkang50.github.io/PVSG/. Codebase:\n https://github.com/LilyDaytoy/OpenPVSG. 
We provide 400 long videos with\n frame-level panoptic segmentation, scene graph, dense captions, and QA\n annotations"},{"id":"http://arxiv.org/abs/2311.17057v1","updated":"2023-11-28T18:59:52Z","published":"2023-11-28T18:59:52Z","title":"ReMoS: Reactive 3D Motion Synthesis for Two-Person Interactions","summary":" Current approaches for 3D human motion synthesis can generate high-quality 3D\nanimations of digital humans performing a wide variety of actions and gestures.\nHowever, there is still a notable technological gap in addressing the complex\ndynamics of multi-human interactions within this paradigm. In this work, we\nintroduce ReMoS, a denoising diffusion-based probabilistic model for reactive\nmotion synthesis that explores two-person interactions. Given the motion of one\nperson, we synthesize the reactive motion of the second person to complete the\ninteractions between the two. In addition to synthesizing the full-body\nmotions, we also synthesize plausible hand interactions. We show the\nperformance of ReMoS under a wide range of challenging two-person scenarios\nincluding pair-dancing, Ninjutsu, kickboxing, and acrobatics, where one\nperson's movements have complex and diverse influences on the motions of the\nother. We further propose the ReMoCap dataset for two-person interactions\nconsisting of full-body and hand motions. We evaluate our approach through\nmultiple quantitative metrics, qualitative visualizations, and a user study.\nOur results are usable in interactive applications while also providing an\nadequate amount of control for animators.\n","authors":["Anindita Ghosh","Rishabh Dabral","Vladislav Golyanik","Christian Theobalt","Philipp Slusallek"],"pdf_url":"https://arxiv.org/pdf/2311.17057v1.pdf","comment":"13 pages, 8 figures, 3 tables"},{"id":"http://arxiv.org/abs/2311.17056v1","updated":"2023-11-28T18:59:51Z","published":"2023-11-28T18:59:51Z","title":"Self-Supervised Motion Magnification by Backpropagating Through Optical\n Flow","summary":" This paper presents a simple, self-supervised method for magnifying subtle\nmotions in video: given an input video and a magnification factor, we\nmanipulate the video such that its new optical flow is scaled by the desired\namount. To train our model, we propose a loss function that estimates the\noptical flow of the generated video and penalizes how far if deviates from the\ngiven magnification factor. Thus, training involves differentiating through a\npretrained optical flow network. Since our model is self-supervised, we can\nfurther improve its performance through test-time adaptation, by finetuning it\non the input video. It can also be easily extended to magnify the motions of\nonly user-selected objects. Our approach avoids the need for synthetic\nmagnification datasets that have been used to train prior learning-based\napproaches. Instead, it leverages the existing capabilities of off-the-shelf\nmotion estimators. 
We demonstrate the effectiveness of our method through\nevaluations of both visual quality and quantitative metrics on a range of\nreal-world and synthetic videos, and we show our method works for both\nsupervised and unsupervised optical flow methods.\n","authors":["Zhaoying Pan","Daniel Geng","Andrew Owens"],"pdf_url":"https://arxiv.org/pdf/2311.17056v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16504v1","updated":"2023-11-28T18:59:50Z","published":"2023-11-28T18:59:50Z","title":"Rethinking Directional Integration in Neural Radiance Fields","summary":" Recent works use the Neural radiance field (NeRF) to perform multi-view 3D\nreconstruction, providing a significant leap in rendering photorealistic\nscenes. However, despite its efficacy, NeRF exhibits limited capability of\nlearning view-dependent effects compared to light field rendering or\nimage-based view synthesis. To that end, we introduce a modification to the\nNeRF rendering equation which is as simple as a few lines of code change for\nany NeRF variations, while greatly improving the rendering quality of\nview-dependent effects. By swapping the integration operator and the direction\ndecoder network, we only integrate the positional features along the ray and\nmove the directional terms out of the integration, resulting in a\ndisentanglement of the view-dependent and independent components. The modified\nequation is equivalent to the classical volumetric rendering in ideal cases on\nobject surfaces with Dirac densities. Furthermore, we prove that with the\nerrors caused by network approximation and numerical integration, our rendering\nequation exhibits better convergence properties with lower error accumulations\ncompared to the classical NeRF. We also show that the modified equation can be\ninterpreted as light field rendering with learned ray embeddings. Experiments\non different NeRF variations show consistent improvements in the quality of\nview-dependent effects with our simple modification.\n","authors":["Congyue Deng","Jiawei Yang","Leonidas Guibas","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2311.16504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03089v2","updated":"2023-11-28T18:59:46Z","published":"2023-06-05T17:59:05Z","title":"Brain Diffusion for Visual Exploration: Cortical Discovery using Large\n Scale Generative Models","summary":" A long standing goal in neuroscience has been to elucidate the functional\norganization of the brain. Within higher visual cortex, functional accounts\nhave remained relatively coarse, focusing on regions of interest (ROIs) and\ntaking the form of selectivity for broad categories such as faces, places,\nbodies, food, or words. Because the identification of such ROIs has typically\nrelied on manually assembled stimulus sets consisting of isolated objects in\nnon-ecological contexts, exploring functional organization without robust a\npriori hypotheses has been challenging. To overcome these limitations, we\nintroduce a data-driven approach in which we synthesize images predicted to\nactivate a given brain region using paired natural images and fMRI recordings,\nbypassing the need for category-specific stimuli. Our approach -- Brain\nDiffusion for Visual Exploration (\"BrainDiVE\") -- builds on recent generative\nmethods by combining large-scale diffusion models with brain-guided image\nsynthesis. Validating our method, we demonstrate the ability to synthesize\npreferred images with appropriate semantic specificity for well-characterized\ncategory-selective ROIs. 
We then show that BrainDiVE can characterize\ndifferences between ROIs selective for the same high-level category. Finally we\nidentify novel functional subdivisions within these ROIs, validated with\nbehavioral data. These results advance our understanding of the fine-grained\nfunctional organization of human visual cortex, and provide well-specified\nconstraints for further examination of cortical organization using\nhypothesis-driven methods.\n","authors":["Andrew F. Luo","Margaret M. Henderson","Leila Wehbe","Michael J. Tarr"],"pdf_url":"https://arxiv.org/pdf/2306.03089v2.pdf","comment":"NeurIPS 2023 (Oral). Project page:\n https://www.cs.cmu.edu/~afluo/BrainDiVE/"},{"id":"http://arxiv.org/abs/2311.17055v1","updated":"2023-11-28T18:59:46Z","published":"2023-11-28T18:59:46Z","title":"No Representation Rules Them All in Category Discovery","summary":" In this paper we tackle the problem of Generalized Category Discovery (GCD).\nSpecifically, given a dataset with labelled and unlabelled images, the task is\nto cluster all images in the unlabelled subset, whether or not they belong to\nthe labelled categories. Our first contribution is to recognize that most\nexisting GCD benchmarks only contain labels for a single clustering of the\ndata, making it difficult to ascertain whether models are using the available\nlabels to solve the GCD task, or simply solving an unsupervised clustering\nproblem. As such, we present a synthetic dataset, named 'Clevr-4', for category\ndiscovery. Clevr-4 contains four equally valid partitions of the data, i.e\nbased on object shape, texture, color or count. To solve the task, models are\nrequired to extrapolate the taxonomy specified by the labelled set, rather than\nsimply latching onto a single natural grouping of the data. We use this dataset\nto demonstrate the limitations of unsupervised clustering in the GCD setting,\nshowing that even very strong unsupervised models fail on Clevr-4. We further\nuse Clevr-4 to examine the weaknesses of existing GCD algorithms, and propose a\nnew method which addresses these shortcomings, leveraging consistent findings\nfrom the representation learning literature to do so. Our simple solution,\nwhich is based on 'mean teachers' and termed $\\mu$GCD, substantially\noutperforms implemented baselines on Clevr-4. Finally, when we transfer these\nfindings to real data on the challenging Semantic Shift Benchmark (SSB), we\nfind that $\\mu$GCD outperforms all prior work, setting a new state-of-the-art.\nFor the project webpage, see https://www.robots.ox.ac.uk/~vgg/data/clevr4/\n","authors":["Sagar Vaze","Andrea Vedaldi","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2311.17055v1.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.17053v1","updated":"2023-11-28T18:58:48Z","published":"2023-11-28T18:58:48Z","title":"DiffuseBot: Breeding Soft Robots With Physics-Augmented Generative\n Diffusion Models","summary":" Nature evolves creatures with a high complexity of morphological and\nbehavioral intelligence, meanwhile computational methods lag in approaching\nthat diversity and efficacy. Co-optimization of artificial creatures'\nmorphology and control in silico shows promise for applications in physical\nsoft robotics and virtual character creation; such approaches, however, require\ndeveloping new learning algorithms that can reason about function atop pure\nstructure. 
In this paper, we present DiffuseBot, a physics-augmented diffusion\nmodel that generates soft robot morphologies capable of excelling in a wide\nspectrum of tasks. DiffuseBot bridges the gap between virtually generated\ncontent and physical utility by (i) augmenting the diffusion process with a\nphysical dynamical simulation which provides a certificate of performance, and\n(ii) introducing a co-design procedure that jointly optimizes physical design\nand control by leveraging information about physical sensitivities from\ndifferentiable simulation. We showcase a range of simulated and fabricated\nrobots along with their capabilities. Check our website at\nhttps://diffusebot.github.io/\n","authors":["Tsun-Hsuan Wang","Juntian Zheng","Pingchuan Ma","Yilun Du","Byungchul Kim","Andrew Spielberg","Joshua Tenenbaum","Chuang Gan","Daniela Rus"],"pdf_url":"https://arxiv.org/pdf/2311.17053v1.pdf","comment":"NeurIPS 2023. Project page: https://diffusebot.github.io/"},{"id":"http://arxiv.org/abs/2311.17050v1","updated":"2023-11-28T18:56:01Z","published":"2023-11-28T18:56:01Z","title":"Surf-D: High-Quality Surface Generation for Arbitrary Topologies using\n Diffusion Models","summary":" In this paper, we present Surf-D, a novel method for generating high-quality\n3D shapes as Surfaces with arbitrary topologies using Diffusion models.\nSpecifically, we adopt Unsigned Distance Field (UDF) as the surface\nrepresentation, as it excels in handling arbitrary topologies, enabling the\ngeneration of complex shapes. While the prior methods explored shape generation\nwith different representations, they suffer from limited topologies and\ngeometry details. Moreover, it's non-trivial to directly extend prior diffusion\nmodels to UDF because they lack spatial continuity due to the discrete volume\nstructure. However, UDF requires accurate gradients for mesh extraction and\nlearning. To tackle the issues, we first leverage a point-based auto-encoder to\nlearn a compact latent space, which supports gradient querying for any input\npoint through differentiation to effectively capture intricate geometry at a\nhigh resolution. Since the learning difficulty for various shapes can differ, a\ncurriculum learning strategy is employed to efficiently embed various surfaces,\nenhancing the whole embedding process. With pretrained shape latent space, we\nemploy a latent diffusion model to acquire the distribution of various shapes.\nOur approach demonstrates superior performance in shape generation across\nmultiple modalities and conducts extensive experiments in unconditional\ngeneration, category conditional generation, 3D reconstruction from images, and\ntext-to-shape tasks.\n","authors":["Zhengming Yu","Zhiyang Dou","Xiaoxiao Long","Cheng Lin","Zekun Li","Yuan Liu","Norman Müller","Taku Komura","Marc Habermann","Christian Theobalt","Xin Li","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17050v1.pdf","comment":"Project Page: https://yzmblog.github.io/projects/SurfD/"},{"id":"http://arxiv.org/abs/2311.17049v1","updated":"2023-11-28T18:55:42Z","published":"2023-11-28T18:55:42Z","title":"MobileCLIP: Fast Image-Text Models through Multi-Modal Reinforced\n Training","summary":" Contrastive pretraining of image-text foundation models, such as CLIP,\ndemonstrated excellent zero-shot performance and improved robustness on a wide\nrange of downstream tasks. However, these models utilize large\ntransformer-based encoders with significant memory and latency overhead which\npose challenges for deployment on mobile devices. 
In this work, we introduce\nMobileCLIP -- a new family of efficient image-text models optimized for runtime\nperformance along with a novel and efficient training approach, namely\nmulti-modal reinforced training. The proposed training approach leverages\nknowledge transfer from an image captioning model and an ensemble of strong\nCLIP encoders to improve the accuracy of efficient models. Our approach avoids\ntrain-time compute overhead by storing the additional knowledge in a reinforced\ndataset. MobileCLIP sets a new state-of-the-art latency-accuracy tradeoff for\nzero-shot classification and retrieval tasks on several datasets. Our\nMobileCLIP-S2 variant is 2.3$\\times$ faster while more accurate compared to\nprevious best CLIP model based on ViT-B/16. We further demonstrate the\neffectiveness of our multi-modal reinforced training by training a CLIP model\nbased on ViT-B/16 image backbone and achieving +2.9% average performance\nimprovement on 38 evaluation benchmarks compared to the previous best.\nMoreover, we show that the proposed approach achieves 10$\\times$-1000$\\times$\nimproved learning efficiency when compared with non-reinforced CLIP training.\n","authors":["Pavan Kumar Anasosalu Vasu","Hadi Pouransari","Fartash Faghri","Raviteja Vemulapalli","Oncel Tuzel"],"pdf_url":"https://arxiv.org/pdf/2311.17049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17048v1","updated":"2023-11-28T18:55:37Z","published":"2023-11-28T18:55:37Z","title":"Zero-shot Referring Expression Comprehension via Structural Similarity\n Between Images and Captions","summary":" Zero-shot referring expression comprehension aims at localizing bounding\nboxes in an image corresponding to the provided textual prompts, which\nrequires: (i) a fine-grained disentanglement of complex visual scene and\ntextual context, and (ii) a capacity to understand relationships among\ndisentangled entities. Unfortunately, existing large vision-language alignment\n(VLA) models, e.g., CLIP, struggle with both aspects so cannot be directly used\nfor this task. To mitigate this gap, we leverage large foundation models to\ndisentangle both images and texts into triplets in the format of (subject,\npredicate, object). After that, grounding is accomplished by calculating the\nstructural similarity matrix between visual and textual triplets with a VLA\nmodel, and subsequently propagate it to an instance-level similarity matrix.\nFurthermore, to equip VLA models with the ability of relationship\nunderstanding, we design a triplet-matching objective to fine-tune the VLA\nmodels on a collection of curated dataset containing abundant entity\nrelationships. Experiments demonstrate that our visual grounding performance\nincrease of up to 19.5% over the SOTA zero-shot model on RefCOCO/+/g. On the\nmore challenging Who's Waldo dataset, our zero-shot approach achieves\ncomparable accuracy to the fully supervised model.\n","authors":["Zeyu Han","Fangrui Zhu","Qianru Lao","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.17048v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17043v1","updated":"2023-11-28T18:53:43Z","published":"2023-11-28T18:53:43Z","title":"LLaMA-VID: An Image is Worth 2 Tokens in Large Language Models","summary":" In this work, we present a novel method to tackle the token generation\nchallenge in Vision Language Models (VLMs) for video and image understanding,\ncalled LLaMA-VID. 
Current VLMs, while proficient in tasks like image captioning\nand visual question answering, face computational burdens when processing long\nvideos due to the excessive visual tokens. LLaMA-VID addresses this issue by\nrepresenting each frame with two distinct tokens, namely context token and\ncontent token. The context token encodes the overall image context based on\nuser input, whereas the content token encapsulates visual cues in each frame.\nThis dual-token strategy significantly reduces the overload of long videos\nwhile preserving critical information. Generally, LLaMA-VID empowers existing\nframeworks to support hour-long videos and pushes their upper limit with an\nextra context token. It is proved to surpass previous methods on most of video-\nor image-based benchmarks. Code is available at\nhttps://github.com/dvlab-research/LLaMA-VID\n","authors":["Yanwei Li","Chengyao Wang","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2311.17043v1.pdf","comment":"Code is available at https://github.com/dvlab-research/LLaMA-VID"},{"id":"http://arxiv.org/abs/2311.17042v1","updated":"2023-11-28T18:53:24Z","published":"2023-11-28T18:53:24Z","title":"Adversarial Diffusion Distillation","summary":" We introduce Adversarial Diffusion Distillation (ADD), a novel training\napproach that efficiently samples large-scale foundational image diffusion\nmodels in just 1-4 steps while maintaining high image quality. We use score\ndistillation to leverage large-scale off-the-shelf image diffusion models as a\nteacher signal in combination with an adversarial loss to ensure high image\nfidelity even in the low-step regime of one or two sampling steps. Our analyses\nshow that our model clearly outperforms existing few-step methods (GANs, Latent\nConsistency Models) in a single step and reaches the performance of\nstate-of-the-art diffusion models (SDXL) in only four steps. ADD is the first\nmethod to unlock single-step, real-time image synthesis with foundation models.\nCode and weights available under\nhttps://github.com/Stability-AI/generative-models and\nhttps://huggingface.co/stabilityai/ .\n","authors":["Axel Sauer","Dominik Lorenz","Andreas Blattmann","Robin Rombach"],"pdf_url":"https://arxiv.org/pdf/2311.17042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17041v1","updated":"2023-11-28T18:53:06Z","published":"2023-11-28T18:53:06Z","title":"Efficient In-Context Learning in Vision-Language Models for Egocentric\n Videos","summary":" Recent advancements in text-only large language models (LLMs) have\nhighlighted the benefit of in-context learning for adapting to new tasks with a\nfew demonstrations. However, extending in-context learning to large\nvision-language models (VLMs) using a huge amount of naturalistic\nvision-language data has shown limited success, particularly for egocentric\nvideos, due to high data collection costs. We propose a novel training method\n$\\mathbb{E}$fficient $\\mathbb{I}$n-context $\\mathbb{L}$earning on\n$\\mathbb{E}$gocentric $\\mathbb{V}$ideos ($\\mathbb{EILEV}$), which elicits\nin-context learning in VLMs for egocentric videos without requiring massive,\nnaturalistic egocentric video datasets. 
$\\mathbb{EILEV}$ involves architectural\nand training data adaptations to allow the model to process contexts\ninterleaved with video clips and narrations, sampling of in-context examples\nwith clusters of similar verbs and nouns, use of data with skewed marginal\ndistributions with a long tail of infrequent verbs and nouns, as well as\nhomonyms and synonyms. Our evaluations show that $\\mathbb{EILEV}$-trained\nmodels outperform larger VLMs trained on a huge amount of naturalistic data in\nin-context learning. Furthermore, they can generalize to not only\nout-of-distribution, but also novel, rare egocentric videos and texts via\nin-context learning, demonstrating potential for applications requiring\ncost-effective training, and rapid post-deployment adaptability. Our code and\ndemo are available at \\url{https://github.com/yukw777/EILEV}.\n","authors":["Keunwoo Peter Yu","Zheyuan Zhang","Fengyuan Hu","Joyce Chai"],"pdf_url":"https://arxiv.org/pdf/2311.17041v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17034v1","updated":"2023-11-28T18:45:13Z","published":"2023-11-28T18:45:13Z","title":"Telling Left from Right: Identifying Geometry-Aware Semantic\n Correspondence","summary":" While pre-trained large-scale vision models have shown significant promise\nfor semantic correspondence, their features often struggle to grasp the\ngeometry and orientation of instances. This paper identifies the importance of\nbeing geometry-aware for semantic correspondence and reveals a limitation of\nthe features of current foundation models under simple post-processing. We show\nthat incorporating this information can markedly enhance semantic\ncorrespondence performance with simple but effective solutions in both\nzero-shot and supervised settings. We also construct a new challenging\nbenchmark for semantic correspondence built from an existing animal pose\nestimation dataset, for both pre-training and validating models. Our method\nachieves a PCK@0.10 score of 64.2 (zero-shot) and 85.6 (supervised) on the\nchallenging SPair-71k dataset, outperforming the state-of-the-art by 4.3p and\n11.0p absolute gains, respectively. Our code and datasets will be publicly\navailable.\n","authors":["Junyi Zhang","Charles Herrmann","Junhwa Hur","Eric Chen","Varun Jampani","Deqing Sun","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2311.17034v1.pdf","comment":"Project page: https://telling-left-from-right.github.io/"},{"id":"http://arxiv.org/abs/2311.17026v1","updated":"2023-11-28T18:28:03Z","published":"2023-11-28T18:28:03Z","title":"When the Few Outweigh the Many: Illicit Content Recognition with\n Few-Shot Learning","summary":" The anonymity and untraceability benefits of the Dark web account for the\nexponentially-increased potential of its popularity while creating a suitable\nwomb for many illicit activities, to date. Hence, in collaboration with\ncybersecurity and law enforcement agencies, research has provided approaches\nfor recognizing and classifying illicit activities with most exploiting textual\ndark web markets' content recognition; few such approaches use images that\noriginated from dark web content. This paper investigates this alternative\ntechnique for recognizing illegal activities from images. In particular, we\ninvestigate label-agnostic learning techniques like One-Shot and Few-Shot\nlearning featuring the use of Siamese neural networks, a state-of-the-art approach\nin the field. Our solution manages to handle small-scale datasets with\npromising accuracy. 
In particular, Siamese neural networks reach 90.9% on\n20-Shot experiments over a 10-class dataset; this leads us to conclude that\nsuch models are a promising and cheaper alternative to the definition of\nautomated law-enforcing machinery over the dark web.\n","authors":["G. Cascavilla","G. Catolino","M. Conti","D. Mellios","D. A. Tamburri"],"pdf_url":"https://arxiv.org/pdf/2311.17026v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17024v1","updated":"2023-11-28T18:27:15Z","published":"2023-11-28T18:27:15Z","title":"Diffusion 3D Features (Diff3F): Decorating Untextured Shapes with\n Distilled Semantic Features","summary":" We present Diff3F as a simple, robust, and class-agnostic feature descriptor\nthat can be computed for untextured input shapes (meshes or point clouds). Our\nmethod distills diffusion features from image foundational models onto input\nshapes. Specifically, we use the input shapes to produce depth and normal maps\nas guidance for conditional image synthesis, and in the process produce\n(diffusion) features in 2D that we subsequently lift and aggregate on the\noriginal surface. Our key observation is that even if the conditional image\ngenerations obtained from multi-view rendering of the input shapes are\ninconsistent, the associated image features are robust and can be directly\naggregated across views. This produces semantic features on the input shapes,\nwithout requiring additional data or training. We perform extensive experiments\non multiple benchmarks (SHREC'19, SHREC'20, and TOSCA) and demonstrate that our\nfeatures, being semantic instead of geometric, produce reliable correspondence\nacross both isometeric and non-isometrically related shape families.\n","authors":["Niladri Shekhar Dutt","Sanjeev Muralikrishnan","Niloy J. Mitra"],"pdf_url":"https://arxiv.org/pdf/2311.17024v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16103v2","updated":"2023-11-28T18:16:29Z","published":"2023-11-27T18:59:58Z","title":"Video-Bench: A Comprehensive Benchmark and Toolkit for Evaluating\n Video-based Large Language Models","summary":" Video-based large language models (Video-LLMs) have been recently introduced,\ntargeting both fundamental improvements in perception and comprehension, and a\ndiverse range of user inquiries. In pursuit of the ultimate goal of achieving\nartificial general intelligence, a truly intelligent Video-LLM model should not\nonly see and understand the surroundings, but also possess human-level\ncommonsense, and make well-informed decisions for the users. To guide the\ndevelopment of such a model, the establishment of a robust and comprehensive\nevaluation system becomes crucial. To this end, this paper proposes\n\\textit{Video-Bench}, a new comprehensive benchmark along with a toolkit\nspecifically designed for evaluating Video-LLMs. The benchmark comprises 10\nmeticulously crafted tasks, evaluating the capabilities of Video-LLMs across\nthree distinct levels: Video-exclusive Understanding, Prior Knowledge-based\nQuestion-Answering, and Comprehension and Decision-making. In addition, we\nintroduce an automatic toolkit tailored to process model outputs for various\ntasks, facilitating the calculation of metrics and generating convenient final\nscores. We evaluate 8 representative Video-LLMs using \\textit{Video-Bench}. The\nfindings reveal that current Video-LLMs still fall considerably short of\nachieving human-like comprehension and analysis of real-world videos, offering\nvaluable insights for future research directions. 
The benchmark and toolkit are\navailable at: \\url{https://github.com/PKU-YuanGroup/Video-Bench}.\n","authors":["Munan Ning","Bin Zhu","Yujia Xie","Bin Lin","Jiaxi Cui","Lu Yuan","Dongdong Chen","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2311.16103v2.pdf","comment":"Benchmark is available at\n https://github.com/PKU-YuanGroup/Video-Bench"},{"id":"http://arxiv.org/abs/2311.17009v1","updated":"2023-11-28T18:03:27Z","published":"2023-11-28T18:03:27Z","title":"Space-Time Diffusion Features for Zero-Shot Text-Driven Motion Transfer","summary":" We present a new method for text-driven motion transfer - synthesizing a\nvideo that complies with an input text prompt describing the target objects and\nscene while maintaining an input video's motion and scene layout. Prior methods\nare confined to transferring motion across two subjects within the same or\nclosely related object categories and are applicable for limited domains (e.g.,\nhumans). In this work, we consider a significantly more challenging setting in\nwhich the target and source objects differ drastically in shape and\nfine-grained motion characteristics (e.g., translating a jumping dog into a\ndolphin). To this end, we leverage a pre-trained and fixed text-to-video\ndiffusion model, which provides us with generative and motion priors. The\npillar of our method is a new space-time feature loss derived directly from the\nmodel. This loss guides the generation process to preserve the overall motion\nof the input video while complying with the target object in terms of shape and\nfine-grained motion traits.\n","authors":["Danah Yatim","Rafail Fridman","Omer Bar Tal","Yoni Kasten","Tali Dekel"],"pdf_url":"https://arxiv.org/pdf/2311.17009v1.pdf","comment":"Project page: https://diffusion-motion-transfer.github.io/"},{"id":"http://arxiv.org/abs/2311.17005v1","updated":"2023-11-28T17:59:04Z","published":"2023-11-28T17:59:04Z","title":"MVBench: A Comprehensive Multi-modal Video Understanding Benchmark","summary":" With the rapid development of Multi-modal Large Language Models (MLLMs), a\nnumber of diagnostic benchmarks have recently emerged to evaluate the\ncomprehension capabilities of these models. However, most benchmarks\npredominantly assess spatial understanding in the static image tasks, while\noverlooking temporal understanding in the dynamic video tasks. To alleviate\nthis issue, we introduce a comprehensive Multi-modal Video understanding\nBenchmark, namely MVBench, which covers 20 challenging video tasks that cannot\nbe effectively solved with a single frame. Specifically, we first introduce a\nnovel static-to-dynamic method to define these temporal-related tasks. By\ntransforming various static tasks into dynamic ones, we enable the systematic\ngeneration of video tasks that require a broad spectrum of temporal skills,\nranging from perception to cognition. Then, guided by the task definition, we\nautomatically convert public video annotations into multiple-choice QA to\nevaluate each task. On one hand, such a distinct paradigm allows us to build\nMVBench efficiently, without much manual intervention. On the other hand, it\nguarantees evaluation fairness with ground-truth video annotations, avoiding\nthe biased scoring of LLMs. Moreover, we further develop a robust video MLLM\nbaseline, i.e., VideoChat2, by progressive multi-modal training with diverse\ninstruction-tuning data. 
The extensive results on our MVBench reveal that, the\nexisting MLLMs are far from satisfactory in temporal understanding, while our\nVideoChat2 largely surpasses these leading models by over 15% on MVBench. All\nmodels and data are available at https://github.com/OpenGVLab/Ask-Anything.\n","authors":["Kunchang Li","Yali Wang","Yinan He","Yizhuo Li","Yi Wang","Yi Liu","Zun Wang","Jilan Xu","Guo Chen","Ping Luo","Limin Wang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2311.17005v1.pdf","comment":"18 pages, 7 figures, 19 tables"},{"id":"http://arxiv.org/abs/2311.17002v1","updated":"2023-11-28T17:57:44Z","published":"2023-11-28T17:57:44Z","title":"Ranni: Taming Text-to-Image Diffusion for Accurate Instruction Following","summary":" Existing text-to-image (T2I) diffusion models usually struggle in\ninterpreting complex prompts, especially those with quantity, object-attribute\nbinding, and multi-subject descriptions. In this work, we introduce a semantic\npanel as the middleware in decoding texts to images, supporting the generator\nto better follow instructions. The panel is obtained through arranging the\nvisual concepts parsed from the input text by the aid of large language models,\nand then injected into the denoising network as a detailed control signal to\ncomplement the text condition. To facilitate text-to-panel learning, we come up\nwith a carefully designed semantic formatting protocol, accompanied by a\nfully-automatic data preparation pipeline. Thanks to such a design, our\napproach, which we call Ranni, manages to enhance a pre-trained T2I generator\nregarding its textual controllability. More importantly, the introduction of\nthe generative middleware brings a more convenient form of interaction (i.e.,\ndirectly adjusting the elements in the panel or using language instructions)\nand further allows users to finely customize their generation, based on which\nwe develop a practical system and showcase its potential in continuous\ngeneration and chatting-based editing.\n","authors":["Yutong Feng","Biao Gong","Di Chen","Yujun Shen","Yu Liu","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.17002v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15347v2","updated":"2023-11-28T17:47:46Z","published":"2023-05-24T16:59:26Z","title":"A Tale of Two Features: Stable Diffusion Complements DINO for Zero-Shot\n Semantic Correspondence","summary":" Text-to-image diffusion models have made significant advances in generating\nand editing high-quality images. As a result, numerous approaches have explored\nthe ability of diffusion model features to understand and process single images\nfor downstream tasks, e.g., classification, semantic segmentation, and\nstylization. However, significantly less is known about what these features\nreveal across multiple, different images and objects. In this work, we exploit\nStable Diffusion (SD) features for semantic and dense correspondence and\ndiscover that with simple post-processing, SD features can perform\nquantitatively similar to SOTA representations. Interestingly, the qualitative\nanalysis reveals that SD features have very different properties compared to\nexisting representation learning features, such as the recently released\nDINOv2: while DINOv2 provides sparse but accurate matches, SD features provide\nhigh-quality spatial information but sometimes inaccurate semantic matches. 
We\ndemonstrate that a simple fusion of these two features works surprisingly well,\nand a zero-shot evaluation using nearest neighbors on these fused features\nprovides a significant performance gain over state-of-the-art methods on\nbenchmark datasets, e.g., SPair-71k, PF-Pascal, and TSS. We also show that\nthese correspondences can enable interesting applications such as instance\nswapping in two images.\n","authors":["Junyi Zhang","Charles Herrmann","Junhwa Hur","Luisa Polania Cabrera","Varun Jampani","Deqing Sun","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2305.15347v2.pdf","comment":"Accepted by NeurIPS 23, project page:\n https://sd-complements-dino.github.io/"},{"id":"http://arxiv.org/abs/2308.16847v2","updated":"2023-11-28T17:24:29Z","published":"2023-08-31T16:26:17Z","title":"Diffusion Models for Interferometric Satellite Aperture Radar","summary":" Probabilistic Diffusion Models (PDMs) have recently emerged as a very\npromising class of generative models, achieving high performance in natural\nimage generation. However, their performance relative to non-natural images,\nlike radar-based satellite data, remains largely unknown. Generating large\namounts of synthetic (and especially labelled) satellite data is crucial to\nimplement deep-learning approaches for the processing and analysis of\n(interferometric) satellite aperture radar data. Here, we leverage PDMs to\ngenerate several radar-based satellite image datasets. We show that PDMs\nsucceed in generating images with complex and realistic structures, but that\nsampling time remains an issue. Indeed, accelerated sampling strategies, which\nwork well on simple image datasets like MNIST, fail on our radar datasets. We\nprovide a simple and versatile open-source\nhttps://github.com/thomaskerdreux/PDM_SAR_InSAR_generation to train, sample and\nevaluate PDMs using any dataset on a single GPU.\n","authors":["Alexandre Tuel","Thomas Kerdreux","Claudia Hulbert","Bertrand Rouet-Leduc"],"pdf_url":"https://arxiv.org/pdf/2308.16847v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16974v1","updated":"2023-11-28T17:22:17Z","published":"2023-11-28T17:22:17Z","title":"COLE: A Hierarchical Generation Framework for Graphic Design","summary":" Graphic design, which has been evolving since the 15th century, plays a\ncrucial role in advertising. The creation of high-quality designs demands\ncreativity, innovation, and lateral thinking. This intricate task involves\nunderstanding the objective, crafting visual elements such as the background,\ndecoration, font, color, and shape, formulating diverse professional layouts,\nand adhering to fundamental visual design principles. In this paper, we\nintroduce COLE, a hierarchical generation framework designed to comprehensively\naddress these challenges. This COLE system can transform a straightforward\nintention prompt into a high-quality graphic design, while also supporting\nflexible editing based on user input. Examples of such input might include\ndirectives like ``design a poster for Hisaishi's concert.'' The key insight is\nto dissect the complex task of text-to-design generation into a hierarchy of\nsimpler sub-tasks, each addressed by specialized models working\ncollaboratively. The results from these models are then consolidated to produce\na cohesive final output. Our hierarchical task decomposition can streamline the\ncomplex process and significantly enhance generation reliability. 
Our COLE\nsystem consists of multiple fine-tuned Large Language Models (LLMs), Large\nMultimodal Models (LMMs), and Diffusion Models (DMs), each specifically\ntailored for a design-aware text or image generation task. Furthermore, we\nconstruct the DESIGNERINTENTION benchmark to highlight the superiority of our\nCOLE over existing methods in generating high-quality graphic designs from user\nintent. We perceive our COLE as an important step towards addressing more\ncomplex visual design generation tasks in the future.\n","authors":["Peidong Jia","Chenxuan Li","Zeyu Liu","Yichao Shen","Xingru Chen","Yuhui Yuan","Yinglin Zheng","Dong Chen","Ji Li","Xiaodong Xie","Shanghang Zhang","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2311.16974v1.pdf","comment":"Technical report. Project page:\n https://graphic-design-generation.github.io/"},{"id":"http://arxiv.org/abs/2309.10399v2","updated":"2023-11-28T17:19:34Z","published":"2023-09-19T08:00:26Z","title":"Exploiting Causality Signals in Medical Images: A Pilot Study with\n Empirical Results","summary":" We present a novel technique to discover and exploit weak causal signals\ndirectly from images via neural networks for classification purposes. This way,\nwe model how the presence of a feature in one part of the image affects the\nappearance of another feature in a different part of the image. Our method\nconsists of a convolutional neural network backbone and a causality-factors\nextractor module, which computes weights to enhance each feature map according\nto its causal influence in the scene. We developed different architecture\nvariants and empirically evaluated all of our models on two public datasets of\nprostate MRI images and breast histopathology slides for cancer diagnosis. To\nconfirm our quantitative results, we conduct ablation studies and investigate\nthe explainability of our models via class activation maps. Our findings show\nthat our lightweight block extracts meaningful information and improves the\noverall classification, together with producing more robust predictions that\nfocus on relevant parts of the image. That is crucial in medical imaging, where\naccurate and reliable classifications are essential for effective diagnosis and\ntreatment planning.\n","authors":["Gianluca Carloni","Sara Colantonio"],"pdf_url":"https://arxiv.org/pdf/2309.10399v2.pdf","comment":"Repeated analyses with new dataset, provided more visual/algorithmic\n insights, improved clarity, remarked significance and novelty; 17 pages, 8\n figures, second round review"},{"id":"http://arxiv.org/abs/2311.08269v2","updated":"2023-11-28T17:18:44Z","published":"2023-11-14T16:02:18Z","title":"Defining the boundaries: challenges and advances in identifying cells in\n microscopy images","summary":" Segmentation, or the outlining of objects within images, is a critical step\nin the measurement and analysis of cells within microscopy images. While\nimprovements continue to be made in tools that rely on classical methods for\nsegmentation, deep learning-based tools increasingly dominate advances in the\ntechnology. Specialist models such as Cellpose continue to improve in accuracy\nand user-friendliness, and segmentation challenges such as the Multi-Modality\nCell Segmentation Challenge continue to push innovation in accuracy across\nwidely-varying test data as well as efficiency and usability. 
Increased\nattention on documentation, sharing, and evaluation standards are leading to\nincreased user-friendliness and acceleration towards the goal of a truly\nuniversal method.\n","authors":["Nodar Gogoberidze","Beth A. Cimini"],"pdf_url":"https://arxiv.org/pdf/2311.08269v2.pdf","comment":"12 pages, 1 figure, submitted to \"Current Opinion in Biotechnology\""},{"id":"http://arxiv.org/abs/2311.16961v1","updated":"2023-11-28T17:06:28Z","published":"2023-11-28T17:06:28Z","title":"HumanRef: Single Image to 3D Human Generation via Reference-Guided\n Diffusion","summary":" Generating a 3D human model from a single reference image is challenging\nbecause it requires inferring textures and geometries in invisible views while\nmaintaining consistency with the reference image. Previous methods utilizing 3D\ngenerative models are limited by the availability of 3D training data.\nOptimization-based methods that lift text-to-image diffusion models to 3D\ngeneration often fail to preserve the texture details of the reference image,\nresulting in inconsistent appearances in different views. In this paper, we\npropose HumanRef, a 3D human generation framework from a single-view input. To\nensure the generated 3D model is photorealistic and consistent with the input\nimage, HumanRef introduces a novel method called reference-guided score\ndistillation sampling (Ref-SDS), which effectively incorporates image guidance\ninto the generation process. Furthermore, we introduce region-aware attention\nto Ref-SDS, ensuring accurate correspondence between different body regions.\nExperimental results demonstrate that HumanRef outperforms state-of-the-art\nmethods in generating 3D clothed humans with fine geometry, photorealistic\ntextures, and view-consistent appearances.\n","authors":["Jingbo Zhang","Xiaoyu Li","Qi Zhang","Yanpei Cao","Ying Shan","Jing Liao"],"pdf_url":"https://arxiv.org/pdf/2311.16961v1.pdf","comment":"Homepage: https://eckertzhang.github.io/HumanRef.github.io/"},{"id":"http://arxiv.org/abs/2208.13465v2","updated":"2023-11-28T16:49:39Z","published":"2022-08-29T10:05:49Z","title":"Exploring Semantic Attributes from A Foundation Model for Federated\n Learning of Disjoint Label Spaces","summary":" Conventional centralised deep learning paradigms are not feasible when data\nfrom different sources cannot be shared due to data privacy or transmission\nlimitation. To resolve this problem, federated learning has been introduced to\ntransfer knowledge across multiple sources (clients) with non-shared data while\noptimising a globally generalised central model (server). Existing federated\nlearning paradigms mostly focus on transferring holistic high-level knowledge\n(such as class) across models, which are closely related to specific objects of\ninterest so may suffer from inverse attack. In contrast, in this work, we\nconsider transferring mid-level semantic knowledge (such as attribute) which is\nnot sensitive to specific objects of interest and therefore is more\nprivacy-preserving and scalable. To this end, we formulate a new Federated\nZero-Shot Learning (FZSL) paradigm to learn mid-level semantic knowledge at\nmultiple local clients with non-shared local data and cumulatively aggregate a\nglobally generalised central model for deployment. 
To improve model\ndiscriminative ability, we propose to explore semantic knowledge augmentation\nfrom external knowledge for enriching the mid-level semantic space in FZSL.\nExtensive experiments on five zeroshot learning benchmark datasets validate the\neffectiveness of our approach for optimising a generalisable federated learning\nmodel with mid-level semantic knowledge transfer.\n","authors":["Shitong Sun","Chenyang Si","Guile Wu","Shaogang Gong"],"pdf_url":"https://arxiv.org/pdf/2208.13465v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2311.16945v1","updated":"2023-11-28T16:47:59Z","published":"2023-11-28T16:47:59Z","title":"UC-NeRF: Neural Radiance Field for Under-Calibrated multi-view cameras\n in autonomous driving","summary":" Multi-camera setups find widespread use across various applications, such as\nautonomous driving, as they greatly expand sensing capabilities. Despite the\nfast development of Neural radiance field (NeRF) techniques and their wide\napplications in both indoor and outdoor scenes, applying NeRF to multi-camera\nsystems remains very challenging. This is primarily due to the inherent\nunder-calibration issues in multi-camera setup, including inconsistent imaging\neffects stemming from separately calibrated image signal processing units in\ndiverse cameras, and system errors arising from mechanical vibrations during\ndriving that affect relative camera poses. In this paper, we present UC-NeRF, a\nnovel method tailored for novel view synthesis in under-calibrated multi-view\ncamera systems. Firstly, we propose a layer-based color correction to rectify\nthe color inconsistency in different image regions. Second, we propose virtual\nwarping to generate more viewpoint-diverse but color-consistent virtual views\nfor color correction and 3D recovery. Finally, a spatiotemporally constrained\npose refinement is designed for more robust and accurate pose calibration in\nmulti-camera systems. Our method not only achieves state-of-the-art performance\nof novel view synthesis in multi-camera setups, but also effectively\nfacilitates depth estimation in large-scale outdoor scenes with the synthesized\nnovel views.\n","authors":["Kai Cheng","Xiaoxiao Long","Wei Yin","Jin Wang","Zhiqiang Wu","Yuexin Ma","Kaixuan Wang","Xiaozhi Chen","Xuejin Chen"],"pdf_url":"https://arxiv.org/pdf/2311.16945v1.pdf","comment":"See the project page for code, data:\n https://kcheng1021.github.io/ucnerf.github.io"},{"id":"http://arxiv.org/abs/2311.16943v1","updated":"2023-11-28T16:46:44Z","published":"2023-11-28T16:46:44Z","title":"Image segmentation with traveling waves in an exactly solvable recurrent\n neural network","summary":" We study image segmentation using spatiotemporal dynamics in a recurrent\nneural network where the state of each unit is given by a complex number. We\nshow that this network generates sophisticated spatiotemporal dynamics that can\neffectively divide an image into groups according to a scene's structural\ncharacteristics. Using an exact solution of the recurrent network's dynamics,\nwe present a precise description of the mechanism underlying object\nsegmentation in this network, providing a clear mathematical interpretation of\nhow the network performs this task. We then demonstrate a simple algorithm for\nobject segmentation that generalizes across inputs ranging from simple\ngeometric objects in grayscale images to natural images. 
Object segmentation\nacross all images is accomplished with one recurrent neural network that has a\nsingle, fixed set of weights. This demonstrates the expressive potential of\nrecurrent neural networks when constructed using a mathematical approach that\nbrings together their structure, dynamics, and computation.\n","authors":["Luisa H. B. Liboni","Roberto C. Budzinski","Alexandra N. Busch","Sindy Löwe","Thomas A. Keller","Max Welling","Lyle E. Muller"],"pdf_url":"https://arxiv.org/pdf/2311.16943v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16941v1","updated":"2023-11-28T16:46:14Z","published":"2023-11-28T16:46:14Z","title":"Debiasing Multimodal Models via Causal Information Minimization","summary":" Most existing debiasing methods for multimodal models, including causal\nintervention and inference methods, utilize approximate heuristics to represent\nthe biases, such as shallow features from early stages of training or unimodal\nfeatures for multimodal tasks like VQA, etc., which may not be accurate. In\nthis paper, we study bias arising from confounders in a causal graph for\nmultimodal data and examine a novel approach that leverages causally-motivated\ninformation minimization to learn the confounder representations. Robust\npredictive features contain diverse information that helps a model generalize\nto out-of-distribution data. Hence, minimizing the information content of\nfeatures obtained from a pretrained biased model helps learn the simplest\npredictive features that capture the underlying data distribution. We treat\nthese features as confounder representations and use them via methods motivated\nby causal theory to remove bias from models. We find that the learned\nconfounder representations indeed capture dataset biases, and the proposed\ndebiasing methods improve out-of-distribution (OOD) performance on multiple\nmultimodal datasets without sacrificing in-distribution performance.\nAdditionally, we introduce a novel metric to quantify the sufficiency of\nspurious features in models' predictions that further demonstrates the\neffectiveness of our proposed methods. Our code is available at:\nhttps://github.com/Vaidehi99/CausalInfoMin\n","authors":["Vaidehi Patil","Adyasha Maharana","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2311.16941v1.pdf","comment":"EMNLP 2023 Findings (16 pages)"},{"id":"http://arxiv.org/abs/2208.02705v2","updated":"2023-11-28T16:45:07Z","published":"2022-08-04T15:06:29Z","title":"360Roam: Real-Time Indoor Roaming Using Geometry-Aware 360$^\\circ$\n Radiance Fields","summary":" Virtual tour among sparse 360$^\\circ$ images is widely used while hindering\nsmooth and immersive roaming experiences. The emergence of Neural Radiance\nField (NeRF) has showcased significant progress in synthesizing novel views,\nunlocking the potential for immersive scene exploration. Nevertheless, previous\nNeRF works primarily focused on object-centric scenarios, resulting in\nnoticeable performance degradation when applied to outward-facing and\nlarge-scale scenes due to limitations in scene parameterization. 
To achieve\nseamless and real-time indoor roaming, we propose a novel approach using\ngeometry-aware radiance fields with adaptively assigned local radiance fields.\nInitially, we employ multiple 360$^\\circ$ images of an indoor scene to\nprogressively reconstruct explicit geometry in the form of a probabilistic\noccupancy map, derived from a global omnidirectional radiance field.\nSubsequently, we assign local radiance fields through an adaptive\ndivide-and-conquer strategy based on the recovered geometry. By incorporating\ngeometry-aware sampling and decomposition of the global radiance field, our\nsystem effectively utilizes positional encoding and compact neural networks to\nenhance rendering quality and speed. Additionally, the extracted floorplan of\nthe scene aids in providing visual guidance, contributing to a realistic\nroaming experience. To demonstrate the effectiveness of our system, we curated\na diverse dataset of 360$^\\circ$ images encompassing various real-life scenes,\non which we conducted extensive experiments. Quantitative and qualitative\ncomparisons against baseline approaches illustrated the superior performance of\nour system in large-scale indoor scene roaming.\n","authors":["Huajian Huang","Yingshu Chen","Tianjia Zhang","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2208.02705v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13411v2","updated":"2023-11-28T16:41:35Z","published":"2023-09-23T15:48:35Z","title":"Towards Attributions of Input Variables in a Coalition","summary":" This paper aims to develop a new attribution method to explain the conflict\nbetween individual variables' attributions and their coalition's attribution\nfrom a fully new perspective. First, we find that the Shapley value can be\nreformulated as the allocation of Harsanyi interactions encoded by the AI\nmodel. Second, based on the re-allocation of interactions, we extend the Shapley\nvalue to the attribution of coalitions. Third, we derive the\nfundamental mechanism behind the conflict. This conflict comes from the\ninteractions that contain only a subset of the variables in the coalition.\n","authors":["Xinhao Zheng","Huiqi Deng","Bo Fan","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.13411v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16937v1","updated":"2023-11-28T16:39:49Z","published":"2023-11-28T16:39:49Z","title":"The Sky's the Limit: Re-lightable Outdoor Scenes via a Sky-pixel\n Constrained Illumination Prior and Outside-In Visibility","summary":" Inverse rendering of outdoor scenes from unconstrained image collections is a\nchallenging task, particularly illumination/albedo ambiguities and occlusion of\nthe illumination environment (shadowing) caused by geometry. However, there are\nmany cues in an image that can aid in the disentanglement of geometry, albedo\nand shadows. We exploit the fact that any sky pixel provides a direct\nmeasurement of distant lighting in the corresponding direction and, via a\nneural illumination prior, a statistical cue as to the remaining illumination\nenvironment. We also introduce a novel `outside-in' method for computing\ndifferentiable sky visibility based on a neural directional distance function.\nThis is efficient and can be trained in parallel with the neural scene\nrepresentation, allowing gradients from appearance loss to flow from shadows to\ninfluence estimation of illumination and geometry. 
Our method estimates\nhigh-quality albedo, geometry, illumination and sky visibility, achieving\nstate-of-the-art results on the NeRF-OSR relighting benchmark. Our code and\nmodels can be found https://github.com/JADGardner/neusky\n","authors":["James A. D. Gardner","Evgenii Kashin","Bernhard Egger","William A. P. Smith"],"pdf_url":"https://arxiv.org/pdf/2311.16937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16933v1","updated":"2023-11-28T16:33:08Z","published":"2023-11-28T16:33:08Z","title":"SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models","summary":" The development of text-to-video (T2V), i.e., generating videos with a given\ntext prompt, has been significantly advanced in recent years. However, relying\nsolely on text prompts often results in ambiguous frame composition due to\nspatial uncertainty. The research community thus leverages the dense structure\nsignals, e.g., per-frame depth/edge sequences, to enhance controllability,\nwhose collection accordingly increases the burden of inference. In this work,\nwe present SparseCtrl to enable flexible structure control with temporally\nsparse signals, requiring only one or a few inputs, as shown in Figure 1. It\nincorporates an additional condition encoder to process these sparse signals\nwhile leaving the pre-trained T2V model untouched. The proposed approach is\ncompatible with various modalities, including sketches, depth maps, and RGB\nimages, providing more practical control for video generation and promoting\napplications such as storyboarding, depth rendering, keyframe animation, and\ninterpolation. Extensive experiments demonstrate the generalization of\nSparseCtrl on both original and personalized T2V generators. Codes and models\nwill be publicly available at https://guoyww.github.io/projects/SparseCtrl .\n","authors":["Yuwei Guo","Ceyuan Yang","Anyi Rao","Maneesh Agrawala","Dahua Lin","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2311.16933v1.pdf","comment":"Project page: https://guoyww.github.io/projects/SparseCtrl"},{"id":"http://arxiv.org/abs/2310.03059v3","updated":"2023-11-28T16:31:34Z","published":"2023-10-04T16:49:36Z","title":"Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models","summary":" The popularity of pre-trained large models has revolutionized downstream\ntasks across diverse fields, such as language, vision, and multi-modality. To\nminimize the adaption cost for downstream tasks, many Parameter-Efficient\nFine-Tuning (PEFT) techniques are proposed for language and 2D image\npre-trained models. However, the specialized PEFT method for 3D pre-trained\nmodels is still under-explored. To this end, we introduce Point-PEFT, a novel\nframework for adapting point cloud pre-trained models with minimal learnable\nparameters. Specifically, for a pre-trained 3D model, we freeze most of its\nparameters, and only tune the newly added PEFT modules on downstream tasks,\nwhich consist of a Point-prior Prompt and a Geometry-aware Adapter. The\nPoint-prior Prompt adopts a set of learnable prompt tokens, for which we\npropose to construct a memory bank with domain-specific knowledge, and utilize\na parameter-free attention to enhance the prompt tokens. 
The Geometry-aware\nAdapter aims to aggregate point cloud features within spatial neighborhoods to\ncapture fine-grained geometric information through local interactions.\nExtensive experiments indicate that our Point-PEFT can achieve better\nperformance than the full fine-tuning on various downstream tasks, while using\nonly 5% of the trainable parameters, demonstrating the efficiency and\neffectiveness of our approach. Code will be released at\nhttps://github.com/Even-JK/PEFT-3D.\n","authors":["Ivan Tang","Ray Zhang","Zoey Guo","Dong Wang","Zhigang Wang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2310.03059v3.pdf","comment":"10 pages. The specialized PEFT framework for 3D pre-trained models,\n which achieves competitive performance to full fine-tuning, and significantly\n reduces the computational resources. Project page:\n https://github.com/Even-JK/PEFT-3D"},{"id":"http://arxiv.org/abs/2311.16926v1","updated":"2023-11-28T16:31:27Z","published":"2023-11-28T16:31:27Z","title":"LLaFS: When Large-Language Models Meet Few-Shot Segmentation","summary":" This paper proposes LLaFS, the first attempt to leverage large language\nmodels (LLMs) in few-shot segmentation. In contrast to the conventional\nfew-shot segmentation methods that only rely on the limited and biased\ninformation from the annotated support images, LLaFS leverages the vast prior\nknowledge gained by LLM as an effective supplement and directly uses the LLM to\nsegment images in a few-shot manner. To enable the text-based LLM to handle\nimage-related tasks, we carefully design an input instruction that allows the\nLLM to produce segmentation results represented as polygons, and propose a\nregion-attribute table to simulate the human visual mechanism and provide\nmulti-modal guidance. We also synthesize pseudo samples and use curriculum\nlearning for pretraining to augment data and achieve better optimization. LLaFS\nachieves state-of-the-art results on multiple datasets, showing the potential\nof using LLMs for few-shot computer vision tasks. Code will be available at\nhttps://github.com/lanyunzhu99/LLaFS.\n","authors":["Lanyun Zhu","Tianrun Chen","Deyi Ji","Jieping Ye","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2311.16926v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16923v1","updated":"2023-11-28T16:27:24Z","published":"2023-11-28T16:27:24Z","title":"Super-Resolution through StyleGAN Regularized Latent Search: A\n Realism-Fidelity Trade-off","summary":" This paper addresses the problem of super-resolution: constructing a highly\nresolved (HR) image from a low resolved (LR) one. Recent unsupervised\napproaches search the latent space of a StyleGAN pre-trained on HR images, for\nthe image that best downscales to the input LR image. However, they tend to\nproduce out-of-domain images and fail to accurately reconstruct HR images that\nare far from the original domain. Our contribution is twofold. Firstly, we\nintroduce a new regularizer to constrain the search in the latent space,\nensuring that the inverted code lies in the original image manifold. Secondly,\nwe further enhanced the reconstruction through expanding the image prior around\nthe optimal latent code. Our results show that the proposed approach recovers\nrealistic high-quality images for large magnification factors. Furthermore, for\nlow magnification factors, it can still reconstruct details that the generator\ncould not have produced otherwise. 
Altogether, our approach achieves a good\ntrade-off between fidelity and realism for the super-resolution task.\n","authors":["Marzieh Gheisari","Auguste Genovesio"],"pdf_url":"https://arxiv.org/pdf/2311.16923v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16922v1","updated":"2023-11-28T16:26:35Z","published":"2023-11-28T16:26:35Z","title":"Mitigating Object Hallucinations in Large Vision-Language Models through\n Visual Contrastive Decoding","summary":" Large Vision-Language Models (LVLMs) have advanced considerably, intertwining\nvisual recognition and language understanding to generate content that is not\nonly coherent but also contextually attuned. Despite their success, LVLMs still\nsuffer from the issue of object hallucinations, where models generate plausible\nyet incorrect outputs that include objects that do not exist in the images. To\nmitigate this issue, we introduce Visual Contrastive Decoding (VCD), a simple\nand training-free method that contrasts output distributions derived from\noriginal and distorted visual inputs. The proposed VCD effectively reduces the\nover-reliance on statistical bias and unimodal priors, two essential causes of\nobject hallucinations. This adjustment ensures the generated content is closely\ngrounded to visual inputs, resulting in contextually accurate outputs. Our\nexperiments show that VCD, without either additional training or the usage of\nexternal tools, significantly mitigates the object hallucination issue across\ndifferent LVLM families. Beyond mitigating object hallucinations, VCD also\nexcels in general LVLM benchmarks, highlighting its wide-ranging applicability.\n","authors":["Sicong Leng","Hang Zhang","Guanzheng Chen","Xin Li","Shijian Lu","Chunyan Miao","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2311.16922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16918v1","updated":"2023-11-28T16:22:33Z","published":"2023-11-28T16:22:33Z","title":"RichDreamer: A Generalizable Normal-Depth Diffusion Model for Detail\n Richness in Text-to-3D","summary":" Lifting 2D diffusion for 3D generation is a challenging problem due to the\nlack of geometric prior and the complex entanglement of materials and lighting\nin natural images. Existing methods have shown promise by first creating the\ngeometry through score-distillation sampling (SDS) applied to rendered surface\nnormals, followed by appearance modeling. However, relying on a 2D RGB\ndiffusion model to optimize surface normals is suboptimal due to the\ndistribution discrepancy between natural images and normal maps, leading to\ninstability in optimization. In this paper, recognizing that the normal and\ndepth information effectively describe scene geometry and can be automatically\nestimated from images, we propose to learn a generalizable Normal-Depth\ndiffusion model for 3D generation. We achieve this by training on the\nlarge-scale LAION dataset together with the generalizable image-to-depth and\nnormal prior models. In an attempt to alleviate the mixed illumination effects\nin the generated materials, we introduce an albedo diffusion model to impose\ndata-driven constraints on the albedo component. Our experiments show that when\nintegrated into existing text-to-3D pipelines, our models significantly enhance\nthe detail richness, achieving state-of-the-art results. 
Our project page is\nhttps://lingtengqiu.github.io/RichDreamer/.\n","authors":["Lingteng Qiu","Guanying Chen","Xiaodong Gu","Qi Zuo","Mutian Xu","Yushuang Wu","Weihao Yuan","Zilong Dong","Liefeng Bo","Xiaoguang Han"],"pdf_url":"https://arxiv.org/pdf/2311.16918v1.pdf","comment":"Project Page: https://lingtengqiu.github.io/RichDreamer/"},{"id":"http://arxiv.org/abs/2311.16917v1","updated":"2023-11-28T16:20:33Z","published":"2023-11-28T16:20:33Z","title":"UGG: Unified Generative Grasping","summary":" Dexterous grasping aims to produce diverse grasping postures with a high\ngrasping success rate. Regression-based methods that directly predict grasping\nparameters given the object may achieve a high success rate but often lack\ndiversity. Generation-based methods that generate grasping postures conditioned\non the object can often produce diverse grasping, but they are insufficient for\nhigh grasping success due to a lack of discriminative information. To mitigate this,\nwe introduce a unified diffusion-based dexterous grasp generation model, dubbed\nUGG, which operates within the object point cloud and hand parameter\nspaces. Our all-transformer architecture unifies the information from the\nobject, the hand, and the contacts, introducing a novel representation of\ncontact points for improved contact modeling. The flexibility and quality of\nour model enable the integration of a lightweight discriminator, benefiting\nfrom simulated discriminative data, which pushes for a high success rate while\npreserving high diversity. Beyond grasp generation, our model can also generate\nobjects based on hand information, offering valuable insights into object\ndesign and studying how the generative model perceives objects. Our model\nachieves state-of-the-art dexterous grasping on the large-scale DexGraspNet\ndataset while facilitating human-centric object design, marking a significant\nadvancement in dexterous grasping research. Our project page is\nhttps://jiaxin-lu.github.io/ugg/ .\n","authors":["Jiaxin Lu","Hao Kang","Haoxiang Li","Bo Liu","Yiding Yang","Qixing Huang","Gang Hua"],"pdf_url":"https://arxiv.org/pdf/2311.16917v1.pdf","comment":"17 pages, 14 figures"},{"id":"http://arxiv.org/abs/2311.16914v1","updated":"2023-11-28T16:16:10Z","published":"2023-11-28T16:16:10Z","title":"Brain-ID: Learning Robust Feature Representations for Brain Imaging","summary":" Recent learning-based approaches have made astonishing advances in calibrated\nmedical imaging like computerized tomography, yet they struggle to generalize\nin uncalibrated modalities -- notoriously magnetic resonance imaging (MRI),\nwhere performance is highly sensitive to the differences in MR contrast,\nresolution, and orientation between the training and testing data. This\nprevents broad applicability to the diverse clinical acquisition protocols in\nthe real world. We introduce Brain-ID, a robust feature representation learning\nstrategy for brain imaging, which is contrast-agnostic, and robust to the brain\nanatomy of each subject regardless of the appearance of acquired images (i.e.,\ndeformation, contrast, resolution, orientation, artifacts, etc). Brain-ID is\ntrained entirely on synthetic data, and easily adapts to downstream tasks with\nour proposed simple one-layer solution. 
We validate the robustness of Brain-ID\nfeatures, and evaluate their performance in a variety of downstream\napplications, including both contrast-independent (anatomy\nreconstruction/contrast synthesis, brain segmentation), and contrast-dependent\n(super-resolution, bias field estimation) tasks. Extensive experiments on 6\npublic datasets demonstrate that Brain-ID achieves state-of-the-art performance\nin all tasks, and more importantly, preserves its performance when only limited\ntraining data is available.\n","authors":["Peirong Liu","Oula Puonti","Xiaoling Hu","Daniel C. Alexander","Juan Eugenio Iglesias"],"pdf_url":"https://arxiv.org/pdf/2311.16914v1.pdf","comment":"16 pages, 10 figures"},{"id":"http://arxiv.org/abs/2311.16900v1","updated":"2023-11-28T15:58:13Z","published":"2023-11-28T15:58:13Z","title":"Lane-Keeping Control of Autonomous Vehicles Through a Soft-Constrained\n Iterative LQR","summary":" The accurate prediction of smooth steering inputs is crucial for autonomous\nvehicle applications because control actions with jitter might cause the\nvehicle system to become unstable. To address this problem in automobile\nlane-keeping control without the use of additional smoothing algorithms, we\ndeveloped a soft-constrained iterative linear-quadratic regulator (soft-CILQR)\nalgorithm by integrating CILQR algorithm and a model predictive control (MPC)\nconstraint relaxation method. We incorporated slack variables into the state\nand control barrier functions of the soft-CILQR solver to soften the\nconstraints in the optimization process so that stabilizing control inputs can\nbe calculated in a relatively simple manner. Two types of automotive\nlane-keeping experiments were conducted with a linear system dynamics model to\ntest the performance of the proposed soft-CILQR algorithm and to compare its\nperformance with that of the CILQR algorithm: numerical simulations and\nexperiments involving challenging vision-based maneuvers. In the numerical\nsimulations, the soft-CILQR and CILQR solvers managed to drive the system\ntoward the reference state asymptotically; however, the soft-CILQR solver\nobtained smooth steering input trajectories more easily than did the CILQR\nsolver under conditions involving additive disturbances. In the experiments\nwith visual inputs, the soft-CILQR controller outperformed the CILQR controller\nin terms of tracking accuracy and steering smoothness during the driving of an\nego vehicle on TORCS.\n","authors":["Der-Hau Lee"],"pdf_url":"https://arxiv.org/pdf/2311.16900v1.pdf","comment":"11 figures, 10 pages"},{"id":"http://arxiv.org/abs/2310.06627v3","updated":"2023-11-28T15:57:16Z","published":"2023-10-10T13:45:59Z","title":"What If the TV Was Off? Examining Counterfactual Reasoning Abilities of\n Multi-modal Language Models","summary":" Counterfactual reasoning, a fundamental aspect of human cognition, involves\ncontemplating alternatives to established facts or past events, significantly\nenhancing our abilities in planning and decision-making. In light of the\nadvancements in current multi-modal large language models, we explore their\neffectiveness in counterfactual reasoning. To facilitate this investigation, we\nintroduce a novel dataset, C-VQA, specifically designed to test the\ncounterfactual reasoning capabilities of modern multi-modal large language\nmodels. This dataset is constructed by infusing original questions with\ncounterfactual presuppositions, spanning various types such as numerical and\nboolean queries. 
It encompasses a mix of real and synthetic data, representing\na wide range of difficulty levels. Our thorough evaluations of contemporary\nvision-language models using this dataset have revealed substantial performance\ndrops, with some models showing up to a 40% decrease, highlighting a\nsignificant gap between current models and human-like vision reasoning\ncapabilities. We hope our dataset will serve as a vital benchmark for\nevaluating the counterfactual reasoning capabilities of models. Code and\ndataset are publicly available at https://bzhao.me/C-VQA/.\n","authors":["Letian Zhang","Xiaotong Zhai","Zhongkai Zhao","Yongshuo Zong","Xin Wen","Bingchen Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.06627v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16894v1","updated":"2023-11-28T15:46:12Z","published":"2023-11-28T15:46:12Z","title":"Dendrogram distance: an evaluation metric for generative networks using\n hierarchical clustering","summary":" We present a novel metric for generative modeling evaluation, focusing\nprimarily on generative networks. The method uses dendrograms to represent real\nand fake data, allowing for the divergence between training and generated\nsamples to be computed. This metric focuses on mode collapse, targeting\ngenerators that are not able to capture all modes in the training set. To\nevaluate the proposed method, we introduce a validation scheme based on\nsampling from real datasets, so that the metric is evaluated in a controlled\nenvironment and proves to be competitive with other state-of-the-art\napproaches.\n","authors":["Gustavo Sutter Carvalho","Moacir Antonelli Ponti"],"pdf_url":"https://arxiv.org/pdf/2311.16894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.13131v2","updated":"2023-11-28T15:41:46Z","published":"2022-11-23T17:04:20Z","title":"FeTrIL: Feature Translation for Exemplar-Free Class-Incremental Learning","summary":" Exemplar-free class-incremental learning is very challenging due to the\nnegative effect of catastrophic forgetting. A balance between stability and\nplasticity of the incremental process is needed in order to obtain good\naccuracy for past as well as new classes. Existing exemplar-free\nclass-incremental methods focus either on successive fine tuning of the model,\nthus favoring plasticity, or on using a feature extractor fixed after the\ninitial incremental state, thus favoring stability. We introduce a method which\ncombines a fixed feature extractor and a pseudo-features generator to improve\nthe stability-plasticity balance. The generator uses a simple yet effective\ngeometric translation of new class features to create representations of past\nclasses, made of pseudo-features. The translation of features only requires the\nstorage of the centroid representations of past classes to produce their\npseudo-features. Actual features of new classes and pseudo-features of past\nclasses are fed into a linear classifier which is trained incrementally to\ndiscriminate between all classes. The incremental process is much faster with\nthe proposed method compared to mainstream ones which update the entire deep\nmodel. Experiments are performed with three challenging datasets, and different\nincremental settings. 
A comparison with ten existing methods shows that our\nmethod outperforms the others in most cases.\n","authors":["Grégoire Petit","Adrian Popescu","Hugo Schindler","David Picard","Bertrand Delezoide"],"pdf_url":"https://arxiv.org/pdf/2211.13131v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05521v2","updated":"2023-11-28T15:31:46Z","published":"2023-11-09T17:05:53Z","title":"BakedAvatar: Baking Neural Fields for Real-Time Head Avatar Synthesis","summary":" Synthesizing photorealistic 4D human head avatars from videos is essential\nfor VR/AR, telepresence, and video game applications. Although existing Neural\nRadiance Fields (NeRF)-based methods achieve high-fidelity results, the\ncomputational expense limits their use in real-time applications. To overcome\nthis limitation, we introduce BakedAvatar, a novel representation for real-time\nneural head avatar synthesis, deployable in a standard polygon rasterization\npipeline. Our approach extracts deformable multi-layer meshes from learned\nisosurfaces of the head and computes expression-, pose-, and view-dependent\nappearances that can be baked into static textures for efficient rasterization.\nWe thus propose a three-stage pipeline for neural head avatar synthesis, which\nincludes learning continuous deformation, manifold, and radiance fields,\nextracting layered meshes and textures, and fine-tuning texture details with\ndifferential rasterization. Experimental results demonstrate that our\nrepresentation generates synthesis results of comparable quality to other\nstate-of-the-art methods while significantly reducing the inference time\nrequired. We further showcase various head avatar synthesis results from\nmonocular videos, including view synthesis, face reenactment, expression\nediting, and pose editing, all at interactive frame rates.\n","authors":["Hao-Bin Duan","Miao Wang","Jin-Chuan Shi","Xu-Chuan Chen","Yan-Pei Cao"],"pdf_url":"https://arxiv.org/pdf/2311.05521v2.pdf","comment":"ACM Transactions on Graphics (SIGGRAPH Asia 2023). Project Page:\n https://buaavrcg.github.io/BakedAvatar"},{"id":"http://arxiv.org/abs/2311.16882v1","updated":"2023-11-28T15:31:11Z","published":"2023-11-28T15:31:11Z","title":"Optimisation-Based Multi-Modal Semantic Image Editing","summary":" Image editing affords increased control over the aesthetics and content of\ngenerated images. Pre-existing works focus predominantly on text-based\ninstructions to achieve desired image modifications, which limit edit precision\nand accuracy. In this work, we propose an inference-time editing optimisation,\ndesigned to extend beyond textual edits to accommodate multiple editing\ninstruction types (e.g. spatial layout-based; pose, scribbles, edge maps). We\npropose to disentangle the editing task into two competing subtasks: successful\nlocal image modifications and global content consistency preservation, where\nsubtasks are guided through two dedicated loss functions. By allowing to adjust\nthe influence of each loss function, we build a flexible editing solution that\ncan be adjusted to user preferences. 
We evaluate our method using text, pose\nand scribble edit conditions, and highlight our ability to achieve complex\nedits, through both qualitative and quantitative experiments.\n","authors":["Bowen Li","Yongxin Yang","Steven McDonagh","Shifeng Zhang","Petru-Daniel Tudosiu","Sarah Parisot"],"pdf_url":"https://arxiv.org/pdf/2311.16882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.08289v2","updated":"2023-11-28T15:31:04Z","published":"2022-09-17T09:05:47Z","title":"Continuously Controllable Facial Expression Editing in Talking Face\n Videos","summary":" Recently audio-driven talking face video generation has attracted\nconsiderable attention. However, very few researches address the issue of\nemotional editing of these talking face videos with continuously controllable\nexpressions, which is a strong demand in the industry. The challenge is that\nspeech-related expressions and emotion-related expressions are often highly\ncoupled. Meanwhile, traditional image-to-image translation methods cannot work\nwell in our application due to the coupling of expressions with other\nattributes such as poses, i.e., translating the expression of the character in\neach frame may simultaneously change the head pose due to the bias of the\ntraining data distribution. In this paper, we propose a high-quality facial\nexpression editing method for talking face videos, allowing the user to control\nthe target emotion in the edited video continuously. We present a new\nperspective for this task as a special case of motion information editing,\nwhere we use a 3DMM to capture major facial movements and an associated texture\nmap modeled by a StyleGAN to capture appearance details. Both representations\n(3DMM and texture map) contain emotional information and can be continuously\nmodified by neural networks and easily smoothed by averaging in\ncoefficient/latent spaces, making our method simple yet effective. We also\nintroduce a mouth shape preservation loss to control the trade-off between lip\nsynchronization and the degree of exaggeration of the edited expression.\nExtensive experiments and a user study show that our method achieves\nstate-of-the-art performance across various evaluation criteria.\n","authors":["Zhiyao Sun","Yu-Hui Wen","Tian Lv","Yanan Sun","Ziyang Zhang","Yaoyuan Wang","Yong-Jin Liu"],"pdf_url":"https://arxiv.org/pdf/2209.08289v2.pdf","comment":"Accepted by IEEE Transactions on Affective Computing (DOI:\n 10.1109/TAFFC.2023.3334511). Demo video: https://youtu.be/WD-bNVya6kM .\n Project page: https://raineggplant.github.io/FEE4TV"},{"id":"http://arxiv.org/abs/2311.14829v2","updated":"2023-11-28T15:27:26Z","published":"2023-11-24T19:56:01Z","title":"Proximal Algorithms for Accelerated Langevin Dynamics","summary":" We develop a novel class of MCMC algorithms based on a stochastized Nesterov\nscheme. With an appropriate addition of noise, the result is a\ntime-inhomogeneous underdamped Langevin equation, which we prove emits a\nspecified target distribution as its invariant measure. Convergence rates to\nstationarity under Wasserstein-2 distance are established as well.\nMetropolis-adjusted and stochastic gradient versions of the proposed Langevin\ndynamics are also provided. Experimental illustrations show superior\nperformance of the proposed method over typical Langevin samplers for different\nmodels in statistics and image processing including better mixing of the\nresulting Markov chains.\n","authors":["Duy H. Thai","Alexander L. Young","David B. 
Dunson"],"pdf_url":"https://arxiv.org/pdf/2311.14829v2.pdf","comment":"The technical proofs for the paper will be revised"},{"id":"http://arxiv.org/abs/2311.16854v1","updated":"2023-11-28T15:03:53Z","published":"2023-11-28T15:03:53Z","title":"A Unified Approach for Text- and Image-guided 4D Scene Generation","summary":" Large-scale diffusion generative models are greatly simplifying image, video\nand 3D asset creation from user-provided text prompts and images. However, the\nchallenging problem of text-to-4D dynamic 3D scene generation with diffusion\nguidance remains largely unexplored. We propose Dream-in-4D, which features a\nnovel two-stage approach for text-to-4D synthesis, leveraging (1) 3D and 2D\ndiffusion guidance to effectively learn a high-quality static 3D asset in the\nfirst stage; (2) a deformable neural radiance field that explicitly\ndisentangles the learned static asset from its deformation, preserving quality\nduring motion learning; and (3) a multi-resolution feature grid for the\ndeformation field with a displacement total variation loss to effectively learn\nmotion with video diffusion guidance in the second stage. Through a user\npreference study, we demonstrate that our approach significantly advances image\nand motion quality, 3D consistency and text fidelity for text-to-4D generation\ncompared to baseline approaches. Thanks to its motion-disentangled\nrepresentation, Dream-in-4D can also be easily adapted for controllable\ngeneration where appearance is defined by one or multiple images, without the\nneed to modify the motion learning stage. Thus, our method offers, for the\nfirst time, a unified approach for text-to-4D, image-to-4D and personalized 4D\ngeneration tasks.\n","authors":["Yufeng Zheng","Xueting Li","Koki Nagano","Sifei Liu","Otmar Hilliges","Shalini De Mello"],"pdf_url":"https://arxiv.org/pdf/2311.16854v1.pdf","comment":"Project page: https://dream-in-4d.github.io/dream-in-4D/"},{"id":"http://arxiv.org/abs/2311.16845v1","updated":"2023-11-28T14:58:32Z","published":"2023-11-28T14:58:32Z","title":"Wavelet-based Fourier Information Interaction with Frequency Diffusion\n Adjustment for Underwater Image Restoration","summary":" Underwater images are subject to intricate and diverse degradation,\ninevitably affecting the effectiveness of underwater visual tasks. However,\nmost approaches primarily operate in the raw pixel space of images, which\nlimits the exploration of the frequency characteristics of underwater images,\nleading to an inadequate utilization of deep models' representational\ncapabilities in producing high-quality images. In this paper, we introduce a\nnovel Underwater Image Enhancement (UIE) framework, named WF-Diff, designed to\nfully leverage the characteristics of frequency domain information and\ndiffusion models. WF-Diff consists of two detachable networks: Wavelet-based\nFourier information interaction network (WFI2-net) and Frequency Residual\nDiffusion Adjustment Module (FRDAM). With our full exploration of the frequency\ndomain information, WFI2-net aims to achieve preliminary enhancement of\nfrequency information in the wavelet space. Our proposed FRDAM can further\nrefine the high- and low-frequency information of the initial enhanced images,\nwhich can be viewed as a plug-and-play universal module to adjust the detail of\nthe underwater images. 
With the above techniques, our algorithm can show SOTA\nperformance on real-world underwater image datasets, and achieves competitive\nperformance in visual quality.\n","authors":["Chen Zhao","Weiling Cai","Chenyu Dong","Chengwei Hu"],"pdf_url":"https://arxiv.org/pdf/2311.16845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16843v1","updated":"2023-11-28T14:57:14Z","published":"2023-11-28T14:57:14Z","title":"Self-training solutions for the ICCV 2023 GeoNet Challenge","summary":" GeoNet is a recently proposed domain adaptation benchmark consisting of three\nchallenges (i.e., GeoUniDA, GeoImNet, and GeoPlaces). Each challenge contains\nimages collected from the USA and Asia where there are huge geographical gaps.\nOur solution adopts a two-stage source-free domain adaptation framework with a\nSwin Transformer backbone to achieve knowledge transfer from the USA (source)\ndomain to Asia (target) domain. In the first stage, we train a source model\nusing labeled source data with a re-sampling strategy and two types of\ncross-entropy loss. In the second stage, we generate pseudo labels for\nunlabeled target data to fine-tune the model. Our method achieves an H-score of\n74.56% and ultimately ranks 1st in the GeoUniDA challenge. In GeoImNet and\nGeoPlaces challenges, our solution also reaches a top-3 accuracy of 64.46% and\n51.23%, respectively.\n","authors":["Lijun Sheng","Zhengbo Wang","Jian Liang"],"pdf_url":"https://arxiv.org/pdf/2311.16843v1.pdf","comment":"technical report; 1st in the ICCV-2023 GeoUniDA challenge"},{"id":"http://arxiv.org/abs/2311.16839v1","updated":"2023-11-28T14:54:37Z","published":"2023-11-28T14:54:37Z","title":"Beyond Hallucinations: Enhancing LVLMs through Hallucination-Aware\n Direct Preference Optimization","summary":" Multimodal large language models have made significant advancements in recent\nyears, yet they still suffer from a common issue known as the \"hallucination\nproblem\" where the models generate textual descriptions that contain inaccurate\nor non-existent content from the image. To address this issue, this paper\nintroduces a novel strategy: Hallucination-Aware Direct Preference Optimization\n(HA-DPO). Our approach treats the hallucination problem as a unique preference\nselection issue, where the model is trained to favor the non-hallucinating\nresponse when presented with two responses of the same image (one accurate and\none hallucinating). This paper also presents an efficient process for\nconstructing hallucination sample pairs to ensure high-quality,\nstyle-consistent pairs for stable HA-DPO training. We applied this strategy to\ntwo mainstream multimodal models, and the results showed a significant\nreduction in the hallucination problem and an enhancement in the models'\ngeneralization capabilities. With HA-DPO, the MiniGPT-4 model demonstrates\nsignificant advancements: POPE accuracy increases from 51.13% to 85.66% (34.5%\nabsolute improvement), and the MME score escalates from 968.58 to 1365.76 (41%\nrelative improvement). 
The code, models, and datasets will be made publicly\navailable.\n","authors":["Zhiyuan Zhao","Bin Wang","Linke Ouyang","Xiaoyi Dong","Jiaqi Wang","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2311.16839v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2307.11957v4","updated":"2023-11-28T14:53:32Z","published":"2023-07-22T01:56:58Z","title":"High-performance real-world optical computing trained by in situ\n model-free optimization","summary":" Optical computing systems provide high-speed and low-energy data processing\nbut face deficiencies in computationally demanding training and\nsimulation-to-reality gaps. We propose a model-free optimization (MFO) method\nbased on a score gradient estimation algorithm for computationally efficient in\nsitu training of optical computing systems. This approach treats an optical\ncomputing system as a black box and back-propagates the loss directly to the\noptical computing weights' probability distributions, circumventing the need\nfor a computationally heavy and biased system simulation. Our experiments on a\nsingle-layer diffractive optical computing system show that MFO outperforms\nhybrid training on the MNIST and FMNIST datasets. Furthermore, we demonstrate\nimage-free and high-speed classification of cells from their phase maps. Our\nmethod's model-free and high-performance nature, combined with its low demand\nfor computational resources, expedites the transition of optical computing from\nlaboratory demonstrations to real-world applications.\n","authors":["Guangyuan Zhao","Xin Shu"],"pdf_url":"https://arxiv.org/pdf/2307.11957v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16835v1","updated":"2023-11-28T14:51:08Z","published":"2023-11-28T14:51:08Z","title":"Unified-modal Salient Object Detection via Adaptive Prompt Learning","summary":" Existing single-modal and multi-modal salient object detection (SOD) methods\nfocus on designing specific architectures tailored for their respective tasks.\nHowever, developing completely different models for different tasks leads to\nlabor and time consumption, as well as high computational and practical\ndeployment costs. In this paper, we make the first attempt to address both\nsingle-modal and multi-modal SOD in a unified framework called UniSOD.\nNevertheless, assigning appropriate strategies to modality variable inputs is\nchallenging. To this end, UniSOD learns modality-aware prompts with\ntask-specific hints through adaptive prompt learning, which are plugged into\nthe proposed pre-trained baseline SOD model to handle corresponding tasks,\nwhile only requiring few learnable parameters compared to training the entire\nmodel. Each modality-aware prompt is generated from a switchable prompt\ngeneration block, which performs structural switching solely relied on\nsingle-modal and multi-modal inputs. 
UniSOD achieves consistent performance\nimprovement on 14 benchmark datasets for RGB, RGB-D, and RGB-T SOD, which\ndemonstrates that our method effectively and efficiently unifies single-modal\nand multi-modal SOD tasks.\n","authors":["Kunpeng Wang","Chenglong Li","Zhengzheng Tu","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2311.16835v1.pdf","comment":"16 pages, 10 figures"},{"id":"http://arxiv.org/abs/2311.16833v1","updated":"2023-11-28T14:50:50Z","published":"2023-11-28T14:50:50Z","title":"1-Lipschitz Layers Compared: Memory, Speed, and Certifiable Robustness","summary":" The robustness of neural networks against input perturbations with bounded\nmagnitude represents a serious concern in the deployment of deep learning\nmodels in safety-critical systems. Recently, the scientific community has\nfocused on enhancing certifiable robustness guarantees by crafting 1-Lipschitz\nneural networks that leverage Lipschitz bounded dense and convolutional layers.\nAlthough different methods have been proposed in the literature to achieve this\ngoal, understanding the performance of such methods is not straightforward,\nsince different metrics can be relevant (e.g., training time, memory usage,\naccuracy, certifiable robustness) for different applications. For this reason,\nthis work provides a thorough theoretical and empirical comparison between\nmethods by evaluating them in terms of memory usage, speed, and certifiable\nrobust accuracy. The paper also provides some guidelines and recommendations to\nsupport the user in selecting the methods that work best depending on the\navailable resources. We provide code at\nhttps://github.com/berndprach/1LipschitzLayersCompared.\n","authors":["Bernd Prach","Fabio Brau","Giorgio Buttazzo","Christoph H. Lampert"],"pdf_url":"https://arxiv.org/pdf/2311.16833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13847v2","updated":"2023-11-28T14:49:54Z","published":"2023-11-23T08:31:11Z","title":"Perceptual Image Compression with Cooperative Cross-Modal Side\n Information","summary":" The explosion of data has resulted in more and more associated text being\ntransmitted along with images. Inspired by distributed source coding, many\nworks utilize image side information to enhance image compression. However,\nexisting methods generally do not consider using text as side information to\nenhance perceptual compression of images, even though the benefits of\nmultimodal synergy have been widely demonstrated in research. This begs the\nfollowing question: How can we effectively transfer text-level semantic\ndependencies, which are only available to the decoder, to help image compression?\nIn this work, we propose a novel deep image compression method with text-guided\nside information to achieve a better rate-perception-distortion tradeoff.\nSpecifically, we employ the CLIP text encoder and an effective Semantic-Spatial\nAware block to fuse the text and image features. This is done by predicting a\nsemantic mask to guide the learned text-adaptive affine transformation at the\npixel level. Furthermore, we design a text-conditional generative adversarial\nnetwork to improve the perceptual quality of reconstructed images. 
Extensive\nexperiments involving four datasets and ten image quality assessment metrics\ndemonstrate that the proposed approach achieves superior results in terms of\nrate-perception trade-off and semantic distortion.\n","authors":["Shiyu Qin","Bin Chen","Yujun Huang","Baoyi An","Tao Dai","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2311.13847v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16829v1","updated":"2023-11-28T14:48:22Z","published":"2023-11-28T14:48:22Z","title":"Decomposer: Semi-supervised Learning of Image Restoration and Image\n Decomposition","summary":" We present Decomposer, a semi-supervised reconstruction model that decomposes\ndistorted image sequences into their fundamental building blocks - the original\nimage and the applied augmentations, i.e., shadow, light, and occlusions. To\nsolve this problem, we use the SIDAR dataset that provides a large number of\ndistorted image sequences: each sequence contains images with shadows,\nlighting, and occlusions applied to an undistorted version. Each distortion\nchanges the original signal in different ways, e.g., additive or multiplicative\nnoise. We propose a transformer-based model to explicitly learn this\ndecomposition. The sequential model uses 3D Swin-Transformers for\nspatio-temporal encoding and 3D U-Nets as prediction heads for individual parts\nof the decomposition. We demonstrate that by separately pre-training our model\non weakly supervised pseudo labels, we can steer our model to optimize for our\nambiguous problem definition and learn to differentiate between the different\nimage distortions.\n","authors":["Boris Meinardus","Mariusz Trzeciakiewicz","Tim Herzig","Monika Kwiatkowski","Simon Matern","Olaf Hellwich"],"pdf_url":"https://arxiv.org/pdf/2311.16829v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16828v1","updated":"2023-11-28T14:46:51Z","published":"2023-11-28T14:46:51Z","title":"SARA: Controllable Makeup Transfer with Spatial Alignment and\n Region-Adaptive Normalization","summary":" Makeup transfer is a process of transferring the makeup style from a\nreference image to the source images, while preserving the source images'\nidentities. This technique is highly desirable and finds many applications.\nHowever, existing methods lack fine-level control of the makeup style, making\nit challenging to achieve high-quality results when dealing with large spatial\nmisalignments. To address this problem, we propose a novel Spatial Alignment\nand Region-Adaptive normalization method (SARA) in this paper. Our method\ngenerates detailed makeup transfer results that can handle large spatial\nmisalignments and achieve part-specific and shade-controllable makeup transfer.\nSpecifically, SARA comprises three modules: Firstly, a spatial alignment module\nthat preserves the spatial context of makeup and provides a target semantic map\nfor guiding the shape-independent style codes. Secondly, a region-adaptive\nnormalization module that decouples shape and makeup style using per-region\nencoding and normalization, which facilitates the elimination of spatial\nmisalignments. Lastly, a makeup fusion module blends identity features and\nmakeup style by injecting learned scale and bias parameters. 
Experimental\nresults show that our SARA method outperforms existing methods and achieves\nstate-of-the-art performance on two public datasets.\n","authors":["Xiaojing Zhong","Xinyi Huang","Zhonghua Wu","Guosheng Lin","Qingyao Wu"],"pdf_url":"https://arxiv.org/pdf/2311.16828v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14153v3","updated":"2023-11-28T14:37:13Z","published":"2023-06-25T07:40:39Z","title":"DomainStudio: Fine-Tuning Diffusion Models for Domain-Driven Image\n Generation using Limited Data","summary":" Denoising diffusion probabilistic models (DDPMs) have been proven capable of\nsynthesizing high-quality images with remarkable diversity when trained on\nlarge amounts of data. Typical diffusion models and modern large-scale\nconditional generative models like text-to-image generative models are\nvulnerable to overfitting when fine-tuned on extremely limited data. Existing\nworks have explored subject-driven generation using a reference set containing\na few images. However, few prior works explore DDPM-based domain-driven\ngeneration, which aims to learn the common features of target domains while\nmaintaining diversity. This paper proposes a novel DomainStudio approach to\nadapt DDPMs pre-trained on large-scale source datasets to target domains using\nlimited data. It is designed to keep the diversity of subjects provided by\nsource domains and get high-quality and diverse adapted samples in target\ndomains. We propose to keep the relative distances between adapted samples to\nachieve considerable generation diversity. In addition, we further enhance the\nlearning of high-frequency details for better generation quality. Our approach\nis compatible with both unconditional and conditional diffusion models. This\nwork makes the first attempt to realize unconditional few-shot image generation\nwith diffusion models, achieving better quality and greater diversity than\ncurrent state-of-the-art GAN-based approaches. Moreover, this work also\nsignificantly relieves overfitting for conditional generation and realizes\nhigh-quality domain-driven generation, further expanding the applicable\nscenarios of modern large-scale text-to-image models.\n","authors":["Jingyuan Zhu","Huimin Ma","Jiansheng Chen","Jian Yuan"],"pdf_url":"https://arxiv.org/pdf/2306.14153v3.pdf","comment":"extended from DDPM-PA (arXiv:2211.03264), 33 pages, 34 figures"},{"id":"http://arxiv.org/abs/2311.16821v1","updated":"2023-11-28T14:34:04Z","published":"2023-11-28T14:34:04Z","title":"Denoising Diffusion Probabilistic Models for Image Inpainting of Cell\n Distributions in the Human Brain","summary":" Recent advances in imaging and high-performance computing have made it\npossible to image the entire human brain at the cellular level. This is the\nbasis to study the multi-scale architecture of the brain regarding its\nsubdivision into brain areas and nuclei, cortical layers, columns, and cell\nclusters down to single cell morphology. Methods for brain mapping and cell\nsegmentation exploit such images to enable rapid and automated analysis of\ncytoarchitecture and cell distribution in complete series of histological\nsections. However, the presence of inevitable processing artifacts in the image\ndata caused by missing sections, tears in the tissue, or staining variations\nremains the primary reason for gaps in the resulting image data. To this end, we\naim to provide a model that can fill in missing information in a reliable way,\nfollowing the true cell distribution at different scales. 
Inspired by the\nrecent success in image generation, we propose a denoising diffusion\nprobabilistic model (DDPM), trained on light-microscopic scans of cell-body\nstained sections. We extend this model with the RePaint method to impute\nmissing or replace corrupted image data. We show that our trained DDPM is able\nto generate highly realistic image information for this purpose, generating\nplausible cell statistics and cytoarchitectonic patterns. We validate its\noutputs using two established downstream task models trained on the same data.\n","authors":["Jan-Oliver Kropp","Christian Schiffer","Katrin Amunts","Timo Dickscheid"],"pdf_url":"https://arxiv.org/pdf/2311.16821v1.pdf","comment":"Submitted to ISBI-2024"},{"id":"http://arxiv.org/abs/2311.13846v2","updated":"2023-11-28T14:31:43Z","published":"2023-11-23T08:29:32Z","title":"Progressive Learning with Visual Prompt Tuning for Variable-Rate Image\n Compression","summary":" In this paper, we propose a progressive learning paradigm for\ntransformer-based variable-rate image compression. Our approach covers a wide\nrange of compression rates with the assistance of the Layer-adaptive Prompt\nModule (LPM). Inspired by visual prompt tuning, we use LPM to extract prompts\nfor input images and hidden features at the encoder side and decoder side,\nrespectively, which are fed as additional information into the Swin Transformer\nlayer of a pre-trained transformer-based image compression model to affect the\nallocation of attention region and the bits, which in turn changes the target\ncompression ratio of the model. To keep the network lightweight, we\nintegrate prompt networks with fewer convolutional layers.\nExhaustive experiments show that compared to methods based on multiple models,\nwhich are optimized separately for different target rates, the proposed method\narrives at the same performance with 80% savings in parameter storage and 90%\nsavings in datasets. Meanwhile, our model outperforms all current variable\nbitrate image methods in terms of rate-distortion performance and approaches\nthe state-of-the-art fixed bitrate image compression methods trained from\nscratch.\n","authors":["Shiyu Qin","Yimin Zhou","Jinpeng Wang","Bin Chen","Baoyi An","Tao Dai","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2311.13846v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16818v1","updated":"2023-11-28T14:28:41Z","published":"2023-11-28T14:28:41Z","title":"DI-Net : Decomposed Implicit Garment Transfer Network for Digital\n Clothed 3D Human","summary":" 3D virtual try-on enjoys many potential applications and hence has attracted\nwide attention. However, it remains a challenging task that has not been\nadequately solved. Existing 2D virtual try-on methods cannot be directly\nextended to 3D since they lack the ability to perceive the depth of each pixel.\nBesides, 3D virtual try-on approaches are mostly built on a fixed topological\nstructure and involve heavy computation. To deal with these problems, we propose a\nDecomposed Implicit garment transfer network (DI-Net), which can effortlessly\nreconstruct a 3D human mesh with the new try-on result and preserve the\ntexture from an arbitrary perspective. 
Specifically, DI-Net consists of two\nmodules: 1) A complementary warping module that warps the reference image to\nhave the same pose as the source image through dense correspondence learning\nand sparse flow learning; 2) A geometry-aware decomposed transfer module that\ndecomposes the garment transfer into image layout based transfer and texture\nbased transfer, achieving surface and texture reconstruction by constructing\npixel-aligned implicit functions. Experimental results show the effectiveness\nand superiority of our method in the 3D virtual try-on task, yielding\nhigher-quality results than other existing methods.\n","authors":["Xiaojing Zhong","Yukun Su","Zhonghua Wu","Guosheng Lin","Qingyao Wu"],"pdf_url":"https://arxiv.org/pdf/2311.16818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16813v1","updated":"2023-11-28T14:22:24Z","published":"2023-11-28T14:22:24Z","title":"Panacea: Panoramic and Controllable Video Generation for Autonomous\n Driving","summary":" The field of autonomous driving increasingly demands high-quality annotated\ntraining data. In this paper, we propose Panacea, an innovative approach to\ngenerate panoramic and controllable videos in driving scenarios, capable of\nyielding an unlimited number of diverse, annotated samples pivotal for\nautonomous driving advancements. Panacea addresses two critical challenges:\n'Consistency' and 'Controllability.' Consistency ensures temporal and\ncross-view coherence, while Controllability ensures the alignment of generated\ncontent with corresponding annotations. Our approach integrates a novel 4D\nattention and a two-stage generation pipeline to maintain coherence,\nsupplemented by the ControlNet framework for meticulous control by the\nBird's-Eye-View (BEV) layouts. Extensive qualitative and quantitative\nevaluations of Panacea on the nuScenes dataset prove its effectiveness in\ngenerating high-quality multi-view driving-scene videos. This work notably\npropels the field of autonomous driving by effectively augmenting the training\ndataset used for advanced BEV perception techniques.\n","authors":["Yuqing Wen","Yucheng Zhao","Yingfei Liu","Fan Jia","Yanhui Wang","Chong Luo","Chi Zhang","Tiancai Wang","Xiaoyan Sun","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.16813v1.pdf","comment":"Project page: https://panacea-ad.github.io/"},{"id":"http://arxiv.org/abs/2311.06542v2","updated":"2023-11-28T13:50:12Z","published":"2023-11-11T11:35:37Z","title":"Generation Of Colors using Bidirectional Long Short Term Memory Networks","summary":" Human vision can distinguish between a vast spectrum of colours, estimated to\nbe between 2 and 7 million discernible shades. However, this impressive range\ndoes not inherently imply that all these colours have been precisely named and\ndescribed within our lexicon. We often associate colours with familiar objects\nand concepts in our daily lives. This research endeavors to bridge the gap\nbetween our visual perception of countless shades and our ability to articulate\nand name them accurately. A novel model has been developed to achieve this\ngoal, leveraging Bidirectional Long Short-Term Memory (BiLSTM) networks with\nActive learning. This model operates on a proprietary dataset meticulously\ncurated for this study. The primary objective of this research is to create a\nversatile tool for categorizing and naming previously unnamed colours or\nidentifying intermediate shades that elude traditional colour terminology. 
The\nfindings underscore the potential of this innovative approach in\nrevolutionizing our understanding of colour perception and language. Through\nrigorous experimentation and analysis, this study illuminates a promising\navenue for Natural Language Processing (NLP) applications in diverse\nindustries. By facilitating the exploration of the vast colour spectrum the\npotential applications of NLP are extended beyond conventional boundaries.\n","authors":["A. Sinha"],"pdf_url":"https://arxiv.org/pdf/2311.06542v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2311.16782v1","updated":"2023-11-28T13:45:15Z","published":"2023-11-28T13:45:15Z","title":"The curse of language biases in remote sensing VQA: the role of spatial\n attributes, language diversity, and the need for clear evaluation","summary":" Remote sensing visual question answering (RSVQA) opens new opportunities for\nthe use of overhead imagery by the general public, by enabling human-machine\ninteraction with natural language. Building on the recent advances in natural\nlanguage processing and computer vision, the goal of RSVQA is to answer a\nquestion formulated in natural language about a remote sensing image. Language\nunderstanding is essential to the success of the task, but has not yet been\nthoroughly examined in RSVQA. In particular, the problem of language biases is\noften overlooked in the remote sensing community, which can impact model\nrobustness and lead to wrong conclusions about the performances of the model.\nThus, the present work aims at highlighting the problem of language biases in\nRSVQA with a threefold analysis strategy: visual blind models, adversarial\ntesting and dataset analysis. This analysis focuses both on model and data.\nMoreover, we motivate the use of more informative and complementary evaluation\nmetrics sensitive to the issue. The gravity of language biases in RSVQA is then\nexposed for all of these methods with the training of models discarding the\nimage data and the manipulation of the visual input during inference. Finally,\na detailed analysis of question-answer distribution demonstrates the root of\nthe problem in the data itself. Thanks to this analytical study, we observed\nthat biases in remote sensing are more severe than in standard VQA, likely due\nto the specifics of existing remote sensing datasets for the task, e.g.\ngeographical similarities and sparsity, as well as a simpler vocabulary and\nquestion generation strategies. While new, improved and less-biased datasets\nappear as a necessity for the development of the promising field of RSVQA, we\ndemonstrate that more informed, relative evaluation metrics remain much needed\nto transparently communicate results of future RSVQA methods.\n","authors":["Christel Chappuis","Eliot Walt","Vincent Mendez","Sylvain Lobry","Bertrand Le Saux","Devis Tuia"],"pdf_url":"https://arxiv.org/pdf/2311.16782v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01837v2","updated":"2023-11-28T13:36:58Z","published":"2023-10-03T07:01:23Z","title":"Extending CAM-based XAI methods for Remote Sensing Imagery Segmentation","summary":" Current AI-based methods do not provide comprehensible physical\ninterpretations of the utilized data, extracted features, and\npredictions/inference operations. 
As a result, deep learning models trained\nusing high-resolution satellite imagery lack transparency and explainability\nand can be merely seen as a black box, which limits their wide-level adoption.\nExperts need help understanding the complex behavior of AI models and the\nunderlying decision-making process. The explainable artificial intelligence\n(XAI) field is an emerging field providing means for robust, practical, and\ntrustworthy deployment of AI models. Several XAI techniques have been proposed\nfor image classification tasks, whereas the interpretation of image\nsegmentation remains largely unexplored. This paper aims to bridge this gap\nby adapting the recent XAI classification algorithms and making them usable for\nmulti-class image segmentation, where we mainly focus on buildings' segmentation\nfrom high-resolution satellite images. To benchmark and compare the performance\nof the proposed approaches, we introduce a new XAI evaluation methodology and\nmetric based on \"Entropy\" to measure the model uncertainty. Conventional XAI\nevaluation methods rely mainly on feeding area-of-interest regions from the\nimage back to the pre-trained (utility) model and then calculating the average\nchange in the probability of the target class. Those evaluation metrics lack\nthe needed robustness, and we show that using Entropy to monitor the model\nuncertainty in segmenting the pixels within the target class is more suitable.\nWe hope this work will pave the way for additional XAI research for image\nsegmentation and applications in the remote sensing discipline.\n","authors":["Abdul Karim Gizzini","Mustafa Shukor","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01837v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16773v1","updated":"2023-11-28T13:30:10Z","published":"2023-11-28T13:30:10Z","title":"Multi-Channel Cross Modal Detection of Synthetic Face Images","summary":" Synthetically generated face images have been shown to be indistinguishable from\nreal images by humans and as such can lead to a lack of trust in digital\ncontent as they can, for instance, be used to spread misinformation. Therefore,\nthe need to develop algorithms for detecting entirely synthetic face images is\napparent. Of interest are images generated by state-of-the-art deep\nlearning-based models, as these exhibit a high level of visual realism. Recent\nworks have demonstrated that detecting such synthetic face images under\nrealistic circumstances remains difficult as new and improved generative models\nare proposed with rapid speed and arbitrary image post-processing can be\napplied. In this work, we propose a multi-channel architecture for detecting\nentirely synthetic face images which analyses information both in the frequency\nand visible spectra using Cross Modal Focal Loss. We compare the proposed\narchitecture with several related architectures trained using Binary Cross\nEntropy and show in cross-model experiments that the proposed architecture\nsupervised using Cross Modal Focal Loss, in general, achieves the most competitive\nperformance.\n","authors":["M. Ibsen","C. Rathgeb","S. Marcel","C. Busch"],"pdf_url":"https://arxiv.org/pdf/2311.16773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14289v2","updated":"2023-11-28T13:28:24Z","published":"2023-09-25T16:52:59Z","title":"CLIP-DIY: CLIP Dense Inference Yields Open-Vocabulary Semantic\n Segmentation For-Free","summary":" The emergence of CLIP has opened the way for open-world image perception. 
The\nzero-shot classification capabilities of the model are impressive but are\nharder to use for dense tasks such as image segmentation. Several methods have\nproposed different modifications and learning schemes to produce dense output.\nInstead, we propose in this work an open-vocabulary semantic segmentation\nmethod, dubbed CLIP-DIY, which does not require any additional training or\nannotations, but instead leverages existing unsupervised object localization\napproaches. In particular, CLIP-DIY is a multi-scale approach that directly\nexploits CLIP classification abilities on patches of different sizes and\naggregates the decision in a single map. We further guide the segmentation\nusing foreground/background scores obtained using unsupervised object\nlocalization methods. With our method, we obtain state-of-the-art zero-shot\nsemantic segmentation results on PASCAL VOC and perform on par with the best\nmethods on COCO. The code is available at\nhttp://github.com/wysoczanska/clip-diy\n","authors":["Monika Wysoczańska","Michaël Ramamonjisoa","Tomasz Trzciński","Oriane Siméoni"],"pdf_url":"https://arxiv.org/pdf/2309.14289v2.pdf","comment":"Accepted to WACV 2024"},{"id":"http://arxiv.org/abs/2311.16766v1","updated":"2023-11-28T13:14:55Z","published":"2023-11-28T13:14:55Z","title":"Rescuing referral failures during automated diagnosis of domain-shifted\n medical images","summary":" The success of deep learning models deployed in the real world depends\ncritically on their ability to generalize well across diverse data domains.\nHere, we address a fundamental challenge with selective classification during\nautomated diagnosis with domain-shifted medical images. In this scenario,\nmodels must learn to avoid making predictions when label confidence is low,\nespecially when tested with samples far removed from the training set\n(covariate shift). Such uncertain cases are typically referred to the clinician\nfor further analysis and evaluation. Yet, we show that even state-of-the-art\ndomain generalization approaches fail severely during referral when tested on\nmedical images acquired from a different demographic or using a different\ntechnology. We examine two benchmark diagnostic medical imaging datasets\nexhibiting strong covariate shifts: i) diabetic retinopathy prediction with\nretinal fundus images and ii) multilabel disease prediction with chest X-ray\nimages. We show that predictive uncertainty estimates do not generalize well\nunder covariate shifts leading to non-monotonic referral curves, and severe\ndrops in performance (up to 50%) at high referral rates (>70%). We evaluate\nnovel combinations of robust generalization and post hoc referral approaches,\nthat rescue these failures and achieve significant performance improvements,\ntypically >10%, over baseline methods. 
Our study identifies a critical\nchallenge with referral in domain-shifted medical images and finds key\napplications in reliable, automated disease diagnosis.\n","authors":["Anuj Srivastava","Karm Patel","Pradeep Shenoy","Devarajan Sridharan"],"pdf_url":"https://arxiv.org/pdf/2311.16766v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.00126v2","updated":"2023-11-28T13:12:39Z","published":"2023-04-28T23:43:10Z","title":"Event-Free Moving Object Segmentation from Moving Ego Vehicle","summary":" Moving object segmentation (MOS) in dynamic scenes is challenging for\nautonomous driving, especially for sequences obtained from moving ego vehicles.\nMost state-of-the-art methods leverage motion cues obtained from optical flow\nmaps. However, since these methods are often based on optical flows that are\npre-computed from successive RGB frames, this neglects the temporal\nconsideration of events occurring within inter-frame and limits the\npracticality of these methods in real-life situations. To address these\nlimitations, we propose to exploit event cameras for better video\nunderstanding, which provide rich motion cues without relying on optical flow.\nTo foster research in this area, we first introduce a novel large-scale dataset\ncalled DSEC-MOS for moving object segmentation from moving ego vehicles.\nSubsequently, we devise EmoFormer, a novel network able to exploit the event\ndata. For this purpose, we fuse the event prior with spatial semantic maps to\ndistinguish moving objects from the static background, adding another level of\ndense supervision around our object of interest - moving ones. Our proposed\nnetwork relies only on event data for training but does not require event input\nduring inference, making it directly comparable to frame-only methods in terms\nof efficiency and more widely usable in many application cases. An exhaustive\ncomparison with 8 state-of-the-art video object segmentation methods highlights\na significant performance improvement of our method over all other methods.\nProject Page: https://github.com/ZZY-Zhou/DSEC-MOS.\n","authors":["Zhuyun Zhou","Zongwei Wu","Danda Pani Paudel","Rémi Boutteau","Fan Yang","Luc Van Gool","Radu Timofte","Dominique Ginhac"],"pdf_url":"https://arxiv.org/pdf/2305.00126v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10276v3","updated":"2023-11-28T13:07:03Z","published":"2023-03-17T23:18:20Z","title":"Unleashing the Potential of Spiking Neural Networks by Dynamic\n Confidence","summary":" This paper presents a new methodology to alleviate the fundamental trade-off\nbetween accuracy and latency in spiking neural networks (SNNs). The approach\ninvolves decoding confidence information over time from the SNN outputs and\nusing it to develop a decision-making agent that can dynamically determine when\nto terminate each inference.\n The proposed method, Dynamic Confidence, provides several significant\nbenefits to SNNs. 1. It can effectively optimize latency dynamically at\nruntime, setting it apart from many existing low-latency SNN algorithms. Our\nexperiments on CIFAR-10 and ImageNet datasets have demonstrated an average 40%\nspeedup across eight different settings after applying Dynamic Confidence. 2.\nThe decision-making agent in Dynamic Confidence is straightforward to construct\nand highly robust in parameter space, making it extremely easy to implement. 3.\nThe proposed method enables visualizing the potential of any given SNN, which\nsets a target for current SNNs to approach. 
For instance, if an SNN can\nterminate at the most appropriate time point for each input sample, a ResNet-50\nSNN can achieve an accuracy as high as 82.47% on ImageNet within just 4.71 time\nsteps on average. Unlocking the potential of SNNs needs a highly-reliable\ndecision-making agent to be constructed and fed with a high-quality estimation\nof ground truth. In this regard, Dynamic Confidence represents a meaningful\nstep toward realizing the potential of SNNs.\n","authors":["Chen Li","Edward Jones","Steve Furber"],"pdf_url":"https://arxiv.org/pdf/2303.10276v3.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2311.15243v2","updated":"2023-11-28T13:06:43Z","published":"2023-11-26T09:06:40Z","title":"ID-like Prompt Learning for Few-Shot Out-of-Distribution Detection","summary":" Out-of-distribution (OOD) detection methods often exploit auxiliary outliers\nto train model identifying OOD samples, especially discovering challenging\noutliers from auxiliary outliers dataset to improve OOD detection. However,\nthey may still face limitations in effectively distinguishing between the most\nchallenging OOD samples that are much like in-distribution (ID) data, i.e.,\nID-like samples. To this end, we propose a novel OOD detection framework that\ndiscovers ID-like outliers using CLIP from the vicinity space of the ID\nsamples, thus helping to identify these most challenging OOD samples. Then a\nprompt learning framework is proposed that utilizes the identified ID-like\noutliers to further leverage the capabilities of CLIP for OOD detection.\nBenefiting from the powerful CLIP, we only need a small number of ID samples to\nlearn the prompts of the model without exposing other auxiliary outlier\ndatasets. By focusing on the most challenging ID-like OOD samples and elegantly\nexploiting the capabilities of CLIP, our method achieves superior few-shot\nlearning performance on various real-world image datasets (e.g., in 4-shot OOD\ndetection on the ImageNet-1k dataset, our method reduces the average FPR95 by\n12.16% and improves the average AUROC by 2.76%, compared to state-of-the-art\nmethods).\n","authors":["Yichen Bai","Zongbo Han","Changqing Zhang","Bing Cao","Xiaoheng Jiang","Qinghua Hu"],"pdf_url":"https://arxiv.org/pdf/2311.15243v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2311.16759v1","updated":"2023-11-28T13:02:33Z","published":"2023-11-28T13:02:33Z","title":"Gradient-based Local Next-best-view Planning for Improved Perception of\n Targeted Plant Nodes","summary":" Robots are increasingly used in tomato greenhouses to automate\nlabour-intensive tasks such as selective harvesting and de-leafing. To perform\nthese tasks, robots must be able to accurately and efficiently perceive the\nplant nodes that need to be cut, despite the high levels of occlusion from\nother plant parts. We formulate this problem as a local next-best-view (NBV)\nplanning task where the robot has to plan an efficient set of camera viewpoints\nto overcome occlusion and improve the quality of perception. Our formulation\nfocuses on quickly improving the perception accuracy of a single target node to\nmaximise its chances of being cut. Previous methods of NBV planning mostly\nfocused on global view planning and used random sampling of candidate\nviewpoints for exploration, which could suffer from high computational costs,\nineffective view selection due to poor candidates, or non-smooth trajectories\ndue to inefficient sampling. 
We propose a gradient-based NBV planner using\ndifferential ray sampling, which directly estimates the local gradient\ndirection for viewpoint planning to overcome occlusion and improve perception.\nThrough simulation experiments, we showed that our planner can handle\nocclusions and improve the 3D reconstruction and position estimation of nodes\nequally well as a sampling-based NBV planner, while taking ten times less\ncomputation and generating 28% more efficient trajectories.\n","authors":["Akshay K. Burusa","Eldert J. van Henten","Gert Kootstra"],"pdf_url":"https://arxiv.org/pdf/2311.16759v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2311.16754v1","updated":"2023-11-28T12:52:49Z","published":"2023-11-28T12:52:49Z","title":"Towards Full-scene Domain Generalization in Multi-agent Collaborative\n Bird's Eye View Segmentation for Connected and Autonomous Driving","summary":" Collaborative perception has recently gained significant attention in\nautonomous driving, improving perception quality by enabling the exchange of\nadditional information among vehicles. However, deploying collaborative\nperception systems can lead to domain shifts due to diverse environmental\nconditions and data heterogeneity among connected and autonomous vehicles\n(CAVs). To address these challenges, we propose a unified domain generalization\nframework applicable in both training and inference stages of collaborative\nperception. In the training phase, we introduce an Amplitude Augmentation\n(AmpAug) method to augment low-frequency image variations, broadening the\nmodel's ability to learn across various domains. We also employ a\nmeta-consistency training scheme to simulate domain shifts, optimizing the\nmodel with a carefully designed consistency loss to encourage domain-invariant\nrepresentations. In the inference phase, we introduce an intra-system domain\nalignment mechanism to reduce or potentially eliminate the domain discrepancy\namong CAVs prior to inference. Comprehensive experiments substantiate the\neffectiveness of our method in comparison with the existing state-of-the-art\nworks. Code will be released at https://github.com/DG-CAVs/DG-CoPerception.git.\n","authors":["Senkang Hu","Zhengru Fang","Xianhao Chen","Yuguang Fang","Sam Kwong"],"pdf_url":"https://arxiv.org/pdf/2311.16754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16739v1","updated":"2023-11-28T12:35:13Z","published":"2023-11-28T12:35:13Z","title":"As-Plausible-As-Possible: Plausibility-Aware Mesh Deformation Using 2D\n Diffusion Priors","summary":" We present As-Plausible-as-Possible (APAP) mesh deformation technique that\nleverages 2D diffusion priors to preserve the plausibility of a mesh under\nuser-controlled deformation. Our framework uses per-face Jacobians to represent\nmesh deformations, where mesh vertex coordinates are computed via a\ndifferentiable Poisson Solve. The deformed mesh is rendered, and the resulting\n2D image is used in the Score Distillation Sampling (SDS) process, which\nenables extracting meaningful plausibility priors from a pretrained 2D\ndiffusion model. To better preserve the identity of the edited mesh, we\nfine-tune our 2D diffusion model with LoRA. 
Gradients extracted by SDS and a\nuser-prescribed handle displacement are then backpropagated to the per-face\nJacobians, and we use iterative gradient descent to compute the final\ndeformation that balances between the user edit and the output plausibility. We\nevaluate our method with 2D and 3D meshes and demonstrate qualitative and\nquantitative improvements when using plausibility priors over\ngeometry-preservation or distortion-minimization priors used by previous\ntechniques.\n","authors":["Seungwoo Yoo","Kunho Kim","Vladimir G. Kim","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2311.16739v1.pdf","comment":"Project page: https://as-plausible-as-possible.github.io/"},{"id":"http://arxiv.org/abs/2311.16738v1","updated":"2023-11-28T12:34:46Z","published":"2023-11-28T12:34:46Z","title":"Riemannian Self-Attention Mechanism for SPD Networks","summary":" Symmetric positive definite (SPD) matrix has been demonstrated to be an\neffective feature descriptor in many scientific areas, as it can encode\nspatiotemporal statistics of the data adequately on a curved Riemannian\nmanifold, i.e., SPD manifold. Although there are many different ways to design\nnetwork architectures for SPD matrix nonlinear learning, very few solutions\nexplicitly mine the geometrical dependencies of features at different layers.\nMotivated by the great success of self-attention mechanism in capturing\nlong-range relationships, an SPD manifold self-attention mechanism (SMSA) is\nproposed in this paper using some manifold-valued geometric operations, mainly\nthe Riemannian metric, Riemannian mean, and Riemannian optimization. Then, an\nSMSA-based geometric learning module (SMSA-GLM) is designed for the sake of\nimproving the discrimination of the generated deep structured representations.\nExtensive experimental results achieved on three benchmarking datasets show\nthat our modification against the baseline network further alleviates the\ninformation degradation problem and leads to improved accuracy.\n","authors":["Rui Wang","Xiao-Jun Wu","Hui Li","Josef Kittler"],"pdf_url":"https://arxiv.org/pdf/2311.16738v1.pdf","comment":"14 pages, 10 figures, 5 tables"},{"id":"http://arxiv.org/abs/2311.16737v1","updated":"2023-11-28T12:33:49Z","published":"2023-11-28T12:33:49Z","title":"Point'n Move: Interactive Scene Object Manipulation on Gaussian\n Splatting Radiance Fields","summary":" We propose Point'n Move, a method that achieves interactive scene object\nmanipulation with exposed region inpainting. Interactivity here further comes\nfrom intuitive object selection and real-time editing. To achieve this, we\nadopt Gaussian Splatting Radiance Field as the scene representation and fully\nleverage its explicit nature and speed advantage. Its explicit representation\nformulation allows us to devise a 2D prompt points to 3D mask dual-stage\nself-prompting segmentation algorithm, perform mask refinement and merging,\nminimize change as well as provide good initialization for scene inpainting and\nperform editing in real-time without per-editing training, all leads to\nsuperior quality and performance. We test our method by performing editing on\nboth forward-facing and 360 scenes. 
We also compare our method against existing\nscene object removal methods, showing superior quality despite being more\ncapable and having a speed advantage.\n","authors":["Jiajun Huang","Hongchuan Yu"],"pdf_url":"https://arxiv.org/pdf/2311.16737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16728v1","updated":"2023-11-28T12:19:00Z","published":"2023-11-28T12:19:00Z","title":"Photo-SLAM: Real-time Simultaneous Localization and Photorealistic\n Mapping for Monocular, Stereo, and RGB-D Cameras","summary":" The integration of neural rendering and the SLAM system recently showed\npromising results in joint localization and photorealistic view reconstruction.\nHowever, existing methods, fully relying on implicit representations, are so\nresource-hungry that they cannot run on portable devices, which deviates from\nthe original intention of SLAM. In this paper, we present Photo-SLAM, a novel\nSLAM framework with a hyper primitives map. Specifically, we simultaneously\nexploit explicit geometric features for localization and learn implicit\nphotometric features to represent the texture information of the observed\nenvironment. In addition to actively densifying hyper primitives based on\ngeometric features, we further introduce a Gaussian-Pyramid-based training\nmethod to progressively learn multi-level features, enhancing photorealistic\nmapping performance. The extensive experiments with monocular, stereo, and\nRGB-D datasets prove that our proposed system Photo-SLAM significantly\noutperforms current state-of-the-art SLAM systems for online photorealistic\nmapping, e.g., PSNR is 30% higher and rendering speed is hundreds of times\nfaster in the Replica dataset. Moreover, the Photo-SLAM can run at real-time\nspeed using an embedded platform such as Jetson AGX Orin, showing the potential\nof robotics applications.\n","authors":["Huajian Huang","Longwei Li","Hui Cheng","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2311.16728v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13409v2","updated":"2023-11-28T12:12:46Z","published":"2023-11-22T14:13:27Z","title":"CompenHR: Efficient Full Compensation for High-resolution Projector","summary":" Full projector compensation is a practical task of projector-camera systems.\nIt aims to find a projector input image, named compensation image, such that\nwhen projected it cancels the geometric and photometric distortions due to the\nphysical environment and hardware. State-of-the-art methods use deep learning\nto address this problem and show promising performance for low-resolution\nsetups. However, directly applying deep learning to high-resolution setups is\nimpractical due to the long training time and high memory cost. To address this\nissue, this paper proposes a practical full compensation solution. Firstly, we\ndesign an attention-based grid refinement network to improve geometric\ncorrection quality. Secondly, we integrate a novel sampling scheme into an\nend-to-end compensation network to alleviate computation and introduce\nattention blocks to preserve key features. Finally, we construct a benchmark\ndataset for high-resolution projector full compensation. 
In experiments, our\nmethod demonstrates clear advantages in both efficiency and quality.\n","authors":["Yuxi Wang","Haibin Ling","Bingyao Huang"],"pdf_url":"https://arxiv.org/pdf/2311.13409v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16714v1","updated":"2023-11-28T11:53:56Z","published":"2023-11-28T11:53:56Z","title":"Embodied Multi-Modal Agent trained by an LLM from a Parallel TextWorld","summary":" While large language models (LLMs) excel in a simulated world of texts, they\nstruggle to interact with the more realistic world without perceptions of other\nmodalities such as visual or audio signals. Although vision-language models\n(VLMs) integrate LLM modules (1) aligned with static image features, and (2)\nmay possess prior knowledge of world dynamics (as demonstrated in the text\nworld), they have not been trained in an embodied visual world and thus cannot\nalign with its dynamics. On the other hand, training an embodied agent in a\nnoisy visual world without expert guidance is often challenging and\ninefficient. In this paper, we train a VLM agent living in a visual world using\nan LLM agent excelling in a parallel text world (but inapplicable to the visual\nworld). Specifically, we distill LLM's reflection outcomes (improved actions by\nanalyzing mistakes) in a text world's tasks to finetune the VLM on the same\ntasks of the visual world, resulting in an Embodied Multi-Modal Agent (EMMA)\nquickly adapting to the visual world dynamics. Such cross-modality imitation\nlearning between the two parallel worlds enables EMMA to generalize to a broad\nscope of new tasks without any further guidance from the LLM expert. Extensive\nevaluations on the ALFWorld benchmark highlight EMMA's superior performance to\nSOTA VLM-based agents across diverse tasks, e.g., 20%-70% improvement in the\nsuccess rate.\n","authors":["Yijun Yang","Tianyi Zhou","Kanxue Li","Dapeng Tao","Lusong Li","Li Shen","Xiaodong He","Jing Jiang","Yuhui Shi"],"pdf_url":"https://arxiv.org/pdf/2311.16714v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16711v1","updated":"2023-11-28T11:45:35Z","published":"2023-11-28T11:45:35Z","title":"LEDITS++: Limitless Image Editing using Text-to-Image Models","summary":" Text-to-image diffusion models have recently received increasing interest for\ntheir astonishing ability to produce high-fidelity images from solely text\ninputs. Subsequent research efforts aim to exploit and apply their capabilities\nto real image editing. However, existing image-to-image methods are often\ninefficient, imprecise, and of limited versatility. They either require\ntime-consuming fine-tuning, deviate unnecessarily strongly from the input\nimage, and/or lack support for multiple, simultaneous edits. To address these\nissues, we introduce LEDITS++, an efficient yet versatile and precise textual\nimage manipulation technique. LEDITS++'s novel inversion approach requires no\ntuning nor optimization and produces high-fidelity results with a few diffusion\nsteps. Second, our methodology supports multiple simultaneous edits and is\narchitecture-agnostic. Third, we use a novel implicit masking technique that\nlimits changes to relevant image regions. We propose the novel TEdBench++\nbenchmark as part of our exhaustive evaluation. Our results demonstrate the\ncapabilities of LEDITS++ and its improvements over previous methods. 
The\nproject page is available at https://leditsplusplus-project.static.hf.space .\n","authors":["Manuel Brack","Felix Friedrich","Katharina Kornmeier","Linoy Tsaban","Patrick Schramowski","Kristian Kersting","Apolinário Passos"],"pdf_url":"https://arxiv.org/pdf/2311.16711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.02760v3","updated":"2023-11-28T11:44:16Z","published":"2022-11-04T21:51:53Z","title":"Development and evaluation of automated localisation and reconstruction\n of all fruits on tomato plants in a greenhouse based on multi-view perception\n and 3D multi-object tracking","summary":" The ability to accurately represent and localise relevant objects is\nessential for robots to carry out tasks effectively. Traditional approaches,\nwhere robots simply capture an image, process that image to take an action, and\nthen forget the information, have proven to struggle in the presence of\nocclusions. Methods using multi-view perception, which have the potential to\naddress some of these problems, require a world model that guides the\ncollection, integration and extraction of information from multiple viewpoints.\nFurthermore, constructing a generic representation that can be applied in\nvarious environments and tasks is a difficult challenge. In this paper, a novel\napproach for building generic representations in occluded agro-food\nenvironments using multi-view perception and 3D multi-object tracking is\nintroduced. The method is based on a detection algorithm that generates partial\npoint clouds for each detected object, followed by a 3D multi-object tracking\nalgorithm that updates the representation over time. The accuracy of the\nrepresentation was evaluated in a real-world environment, where successful\nrepresentation and localisation of tomatoes in tomato plants were achieved,\ndespite high levels of occlusion, with the total count of tomatoes estimated\nwith a maximum error of 5.08% and the tomatoes tracked with an accuracy up to\n71.47%. Novel tracking metrics were introduced, demonstrating that valuable\ninsight into the errors in localising and representing the fruits can be\nprovided by their use. This approach presents a novel solution for building\nrepresentations in occluded agro-food environments, demonstrating potential to\nenable robots to perform tasks effectively in these challenging environments.\n","authors":["David Rapado Rincon","Eldert J. van Henten","Gert Kootstra"],"pdf_url":"https://arxiv.org/pdf/2211.02760v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.09554v4","updated":"2023-11-28T11:44:10Z","published":"2023-02-19T12:18:45Z","title":"Mixed Hierarchy Network for Image Restoration","summary":" Image restoration is a long-standing low-level vision problem, e.g.,\ndeblurring and deraining. In the process of image restoration, it is necessary\nto consider not only the spatial details and contextual information of\nrestoration to ensure the quality, but also the system complexity. Although\nmany methods have been able to guarantee the quality of image restoration, the\nsystem complexity of the state-of-the-art (SOTA) methods is increasing as well.\nMotivated by this, we present a mixed hierarchy network that can balance these\ncompeting goals. Our main proposal is a mixed hierarchy architecture, that\nprogressively recovers contextual information and spatial details from degraded\nimages while we design intra-blocks to reduce system complexity. 
Specifically,\nour model first learns the contextual information using encoder-decoder\narchitectures, and then combines them with high-resolution branches that\npreserve spatial detail. In order to reduce the system complexity of this\narchitecture for convenient analysis and comparison, we replace or remove the\nnonlinear activation function with multiplication and use a simple network\nstructure. In addition, we replace spatial convolution with global\nself-attention for the middle block of encoder-decoder. The resulting tightly\ninterlinked hierarchy architecture, named as MHNet, delivers strong performance\ngains on several image restoration tasks, including image deraining, and\ndeblurring.\n","authors":["Hu Gao","Depeng Dang"],"pdf_url":"https://arxiv.org/pdf/2302.09554v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16707v1","updated":"2023-11-28T11:32:23Z","published":"2023-11-28T11:32:23Z","title":"Full-resolution MLPs Empower Medical Dense Prediction","summary":" Dense prediction is a fundamental requirement for many medical vision tasks\nsuch as medical image restoration, registration, and segmentation. The most\npopular vision model, Convolutional Neural Networks (CNNs), has reached\nbottlenecks due to the intrinsic locality of convolution operations. Recently,\ntransformers have been widely adopted for dense prediction for their capability\nto capture long-range visual dependence. However, due to the high computational\ncomplexity and large memory consumption of self-attention operations,\ntransformers are usually used at downsampled feature resolutions. Such usage\ncannot effectively leverage the tissue-level textural information available\nonly at the full image resolution. This textural information is crucial for\nmedical dense prediction as it can differentiate the subtle human anatomy in\nmedical images. In this study, we hypothesize that Multi-layer Perceptrons\n(MLPs) are superior alternatives to transformers in medical dense prediction\nwhere tissue-level details dominate the performance, as MLPs enable long-range\ndependence at the full image resolution. To validate our hypothesis, we develop\na full-resolution hierarchical MLP framework that uses MLPs beginning from the\nfull image resolution. We evaluate this framework with various MLP blocks on a\nwide range of medical dense prediction tasks including restoration,\nregistration, and segmentation. Extensive experiments on six public\nwell-benchmarked datasets show that, by simply using MLPs at full resolution,\nour framework outperforms its CNN and transformer counterparts and achieves\nstate-of-the-art performance on various medical dense prediction tasks.\n","authors":["Mingyuan Meng","Yuxin Xue","Dagan Feng","Lei Bi","Jinman Kim"],"pdf_url":"https://arxiv.org/pdf/2311.16707v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2311.16703v1","updated":"2023-11-28T11:27:48Z","published":"2023-11-28T11:27:48Z","title":"CADTalk: An Algorithm and Benchmark for Semantic Commenting of CAD\n Programs","summary":" CAD programs are a popular way to compactly encode shapes as a sequence of\noperations that are easy to parametrically modify. However, without sufficient\nsemantic comments and structure, such programs can be challenging to\nunderstand, let alone modify. We introduce the problem of semantic commenting\nCAD programs, wherein the goal is to segment the input program into code blocks\ncorresponding to semantically meaningful shape parts and assign a semantic\nlabel to each block. 
We solve the problem by combining program parsing with\nvisual-semantic analysis afforded by recent advances in foundational language\nand vision models. Specifically, by executing the input programs, we create\nshapes, which we use to generate conditional photorealistic images to make use\nof semantic annotators for such images. We then distill the information across\nthe images and link back to the original programs to semantically comment on\nthem. Additionally, we collected and annotated a benchmark dataset, CADTalk,\nconsisting of 5,280 machine-made programs and 45 human-made programs with\nground truth semantic comments to foster future research. We extensively\nevaluated our approach, compared to a GPT-based baseline approach, and an\nopen-set shape segmentation baseline, i.e., PartSLIP, and reported an 83.24%\naccuracy on the new CADTalk dataset. Project page:\nhttps://enigma-li.github.io/CADTalk/.\n","authors":["Haocheng Yuan","Jing Xu","Hao Pan","Adrien Bousseau","Niloy Mitra","Changjian Li"],"pdf_url":"https://arxiv.org/pdf/2311.16703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14875v2","updated":"2023-11-28T11:27:27Z","published":"2023-11-24T23:54:33Z","title":"Uncertainty Aware AI for 2D MRI Segmentation","summary":" Robust uncertainty estimations are necessary in safety-critical applications\nof Deep Learning. One such example is the semantic segmentation of medical\nimages, whilst deep-learning approaches have high performance in such tasks\nthey lack interpretability as they give no indication of their confidence when\nmaking classification decisions. Robust and interpretable segmentation is a\ncritical first stage in automatically screening for pathologies hence the\noptimal solution is one which can provide high accuracy but also capture the\nunderlying uncertainty. In this work, we present an uncertainty-aware\nsegmentation model, BA U-Net, for use on MRI data that incorporates Bayesian\nNeural Networks and Attention Mechanisms to provide accurate and interpretable\nsegmentations. We evaluated our model on the publicly available BraTS 2020\ndataset using F1 Score and Intersection Over Union (IoU) as evaluation metrics.\n","authors":["Lohith Konathala"],"pdf_url":"https://arxiv.org/pdf/2311.14875v2.pdf","comment":"14 Pages, 9 Figures Updated to Correct Typos, Revise Title"},{"id":"http://arxiv.org/abs/2310.02071v4","updated":"2023-11-28T11:23:14Z","published":"2023-10-03T14:13:36Z","title":"Towards End-to-End Embodied Decision Making via Multi-modal Large\n Language Model: Explorations with GPT4-Vision and Beyond","summary":" In this study, we explore the potential of Multimodal Large Language Models\n(MLLMs) in improving embodied decision-making processes for agents. While Large\nLanguage Models (LLMs) have been widely used due to their advanced reasoning\nskills and vast world knowledge, MLLMs like GPT4-Vision offer enhanced visual\nunderstanding and reasoning capabilities. We investigate whether\nstate-of-the-art MLLMs can handle embodied decision-making in an end-to-end\nmanner and whether collaborations between LLMs and MLLMs can enhance\ndecision-making. To address these questions, we introduce a new benchmark\ncalled PCA-EVAL, which evaluates embodied decision-making from the perspectives\nof Perception, Cognition, and Action. Additionally, we propose HOLMES, a\nmulti-agent cooperation framework that allows LLMs to leverage MLLMs and APIs\nto gather multimodal information for informed decision-making. 
We compare\nend-to-end embodied decision-making and HOLMES on our benchmark and find that\nthe GPT4-Vision model demonstrates strong end-to-end embodied decision-making\nabilities, outperforming GPT4-HOLMES in terms of average decision accuracy\n(+3%). However, this performance is exclusive to the latest GPT4-Vision model,\nsurpassing the open-source state-of-the-art MLLM by 26%. Our results indicate\nthat powerful MLLMs like GPT4-Vision hold promise for decision-making in\nembodied agents, offering new avenues for MLLM research. Code and data are open\nat https://github.com/pkunlp-icler/PCA-EVAL/.\n","authors":["Liang Chen","Yichi Zhang","Shuhuai Ren","Haozhe Zhao","Zefan Cai","Yuchi Wang","Peiyi Wang","Tianyu Liu","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2310.02071v4.pdf","comment":"FMDM@NeurIPS2023, Code and data:\n https://github.com/pkunlp-icler/PCA-EVAL/"},{"id":"http://arxiv.org/abs/2311.16700v1","updated":"2023-11-28T11:22:08Z","published":"2023-11-28T11:22:08Z","title":"Rethinking Intermediate Layers design in Knowledge Distillation for\n Kidney and Liver Tumor Segmentation","summary":" Knowledge distillation(KD) has demonstrated remarkable success across various\ndomains, but its application to medical imaging tasks, such as kidney and liver\ntumor segmentation, has encountered challenges. Many existing KD methods are\nnot specifically tailored for these tasks. Moreover, prevalent KD methods often\nlack a careful consideration of what and from where to distill knowledge from\nthe teacher to the student. This oversight may lead to issues like the\naccumulation of training bias within shallower student layers, potentially\ncompromising the effectiveness of KD. To address these challenges, we propose\nHierarchical Layer-selective Feedback Distillation (HLFD). HLFD strategically\ndistills knowledge from a combination of middle layers to earlier layers and\ntransfers final layer knowledge to intermediate layers at both the feature and\npixel levels. This design allows the model to learn higher-quality\nrepresentations from earlier layers, resulting in a robust and compact student\nmodel. Extensive quantitative evaluations reveal that HLFD outperforms existing\nmethods by a significant margin. For example, in the kidney segmentation task,\nHLFD surpasses the student model (without KD) by over 10pp, significantly\nimproving its focus on tumor-specific features. From a qualitative standpoint,\nthe student model trained using HLFD excels at suppressing irrelevant\ninformation and can focus sharply on tumor-specific details, which opens a new\npathway for more efficient and accurate diagnostic tools.\n","authors":["Vandan Gorade","Sparsh Mittal","Debesh Jha","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2311.16700v1.pdf","comment":"Under-review at ISBI-2024"},{"id":"http://arxiv.org/abs/2311.16682v1","updated":"2023-11-28T10:53:55Z","published":"2023-11-28T10:53:55Z","title":"ContextSeg: Sketch Semantic Segmentation by Querying the Context with\n Attention","summary":" Sketch semantic segmentation is a well-explored and pivotal problem in\ncomputer vision involving the assignment of pre-defined part labels to\nindividual strokes. This paper presents ContextSeg - a simple yet highly\neffective approach to tackling this problem with two stages. In the first\nstage, to better encode the shape and positional information of strokes, we\npropose to predict an extra dense distance field in an autoencoder network to\nreinforce structural information learning. 
In the second stage, we treat an\nentire stroke as a single entity and label a group of strokes within the same\nsemantic part using an auto-regressive Transformer with the default attention\nmechanism. By group-based labeling, our method can fully leverage the context\ninformation when making decisions for the remaining groups of strokes. Our\nmethod achieves the best segmentation accuracy compared with state-of-the-art\napproaches on two representative datasets and has been extensively evaluated\ndemonstrating its superior performance. Additionally, we offer insights into\nsolving part imbalance in training data and the preliminary experiment on\ncross-category training, which can inspire future research in this field.\n","authors":["Jiawei Wang","Changjian Li"],"pdf_url":"https://arxiv.org/pdf/2311.16682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16681v1","updated":"2023-11-28T10:53:26Z","published":"2023-11-28T10:53:26Z","title":"Understanding the (Extra-)Ordinary: Validating Deep Model Decisions with\n Prototypical Concept-based Explanations","summary":" Ensuring both transparency and safety is critical when deploying Deep Neural\nNetworks (DNNs) in high-risk applications, such as medicine. The field of\nexplainable AI (XAI) has proposed various methods to comprehend the\ndecision-making processes of opaque DNNs. However, only few XAI methods are\nsuitable of ensuring safety in practice as they heavily rely on repeated\nlabor-intensive and possibly biased human assessment. In this work, we present\na novel post-hoc concept-based XAI framework that conveys besides instance-wise\n(local) also class-wise (global) decision-making strategies via prototypes.\nWhat sets our approach apart is the combination of local and global strategies,\nenabling a clearer understanding of the (dis-)similarities in model decisions\ncompared to the expected (prototypical) concept use, ultimately reducing the\ndependence on human long-term assessment. Quantifying the deviation from\nprototypical behavior not only allows to associate predictions with specific\nmodel sub-strategies but also to detect outlier behavior. As such, our approach\nconstitutes an intuitive and explainable tool for model validation. We\ndemonstrate the effectiveness of our approach in identifying\nout-of-distribution samples, spurious model behavior and data quality issues\nacross three datasets (ImageNet, CUB-200, and CIFAR-10) utilizing VGG, ResNet,\nand EfficientNet architectures. Code is available on\nhttps://github.com/maxdreyer/pcx.\n","authors":["Maximilian Dreyer","Reduan Achtibat","Wojciech Samek","Sebastian Lapuschkin"],"pdf_url":"https://arxiv.org/pdf/2311.16681v1.pdf","comment":"37 pages (9 pages manuscript, 2 pages references, 26 pages appendix)"},{"id":"http://arxiv.org/abs/2311.16673v1","updated":"2023-11-28T10:39:19Z","published":"2023-11-28T10:39:19Z","title":"Large Language Models Meet Computer Vision: A Brief Survey","summary":" Recently, the intersection of Large Language Models (LLMs) and Computer\nVision (CV) has emerged as a pivotal area of research, driving significant\nadvancements in the field of Artificial Intelligence (AI). As transformers have\nbecome the backbone of many state-of-the-art models in both Natural Language\nProcessing (NLP) and CV, understanding their evolution and potential\nenhancements is crucial. 
This survey paper delves into the latest progress\nin the domain of transformers and their successors, emphasizing\ntheir potential to revolutionize Vision Transformers (ViTs) and LLMs. This\nsurvey also presents a comparative analysis, juxtaposing the performance\nmetrics of several leading paid and open-source LLMs, shedding light on their\nstrengths and areas for improvement, as well as a literature review of how LLMs\nare being used to tackle vision-related tasks. Furthermore, the survey presents\na comprehensive collection of datasets employed to train LLMs, offering\ninsights into the diverse data available to achieve high performance in various\npre-training and downstream tasks of LLMs. The survey concludes by\nhighlighting open directions in the field, suggesting potential avenues for\nfuture research and development. This survey aims to underscore the profound\nintersection of LLMs and CV, leading to a new era of integrated and advanced AI\nmodels.\n","authors":["Raby Hamadi"],"pdf_url":"https://arxiv.org/pdf/2311.16673v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16671v1","updated":"2023-11-28T10:36:36Z","published":"2023-11-28T10:36:36Z","title":"SplitNeRF: Split Sum Approximation Neural Field for Joint Geometry,\n Illumination, and Material Estimation","summary":" We present a novel approach for digitizing real-world objects by estimating\ntheir geometry, material properties, and environmental lighting from a set of\nposed images with fixed lighting. Our method incorporates into Neural Radiance\nField (NeRF) pipelines the split sum approximation used with image-based\nlighting for real-time physically-based rendering. We propose modeling the\nscene's lighting with a single scene-specific MLP representing pre-integrated\nimage-based lighting at arbitrary resolutions. We achieve accurate modeling of\npre-integrated lighting by exploiting a novel regularizer based on efficient\nMonte Carlo sampling. Additionally, we propose a new method of supervising\nself-occlusion predictions by exploiting a similar regularizer based on Monte\nCarlo sampling. Experimental results demonstrate the efficiency and\neffectiveness of our approach in estimating scene geometry, material\nproperties, and lighting. Our method is capable of attaining state-of-the-art\nrelighting quality after only ${\\sim}1$ hour of training on a single NVIDIA\nA100 GPU.\n","authors":["Jesus Zarzar","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2311.16671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16668v1","updated":"2023-11-28T10:29:39Z","published":"2023-11-28T10:29:39Z","title":"LiveNVS: Neural View Synthesis on Live RGB-D Streams","summary":" Existing real-time RGB-D reconstruction approaches, like Kinect Fusion, lack\nreal-time photo-realistic visualization. This is due to noisy, oversmoothed or\nincomplete geometry and blurry textures which are fused from imperfect depth\nmaps and camera poses. Recent neural rendering methods can overcome many\nsuch artifacts but are mostly optimized for offline usage, hindering their\nintegration into a live reconstruction pipeline.\n In this paper, we present LiveNVS, a system that allows for neural novel view\nsynthesis on a live RGB-D input stream with very low latency and real-time\nrendering. Based on the RGB-D input stream, novel views are rendered by\nprojecting neural features into the target view via a densely fused depth map\nand aggregating the features in image-space to a target feature map. 
A\ngeneralizable neural network then translates the target feature map into a\nhigh-quality RGB image. LiveNVS achieves state-of-the-art neural rendering\nquality of unknown scenes during capturing, allowing users to virtually explore\nthe scene and assess reconstruction quality in real-time.\n","authors":["Laura Fink","Darius Rückert","Linus Franke","Joachim Keinert","Marc Stamminger"],"pdf_url":"https://arxiv.org/pdf/2311.16668v1.pdf","comment":"main paper: 8 pages, total number of pages: 15, 13 figures, to be\n published in SIGGRAPH Asia 2023 Conference Papers"},{"id":"http://arxiv.org/abs/2311.16664v1","updated":"2023-11-28T10:25:55Z","published":"2023-11-28T10:25:55Z","title":"DGNR: Density-Guided Neural Point Rendering of Large Driving Scenes","summary":" Despite the recent success of Neural Radiance Field (NeRF), it is still\nchallenging to render large-scale driving scenes with long trajectories,\nparticularly when the rendering quality and efficiency are in high demand.\nExisting methods for such scenes usually involve with spatial warping,\ngeometric supervision from zero-shot normal or depth estimation, or scene\ndivision strategies, where the synthesized views are often blurry or fail to\nmeet the requirement of efficient rendering. To address the above challenges,\nthis paper presents a novel framework that learns a density space from the\nscenes to guide the construction of a point-based renderer, dubbed as DGNR\n(Density-Guided Neural Rendering). In DGNR, geometric priors are no longer\nneeded, which can be intrinsically learned from the density space through\nvolumetric rendering. Specifically, we make use of a differentiable renderer to\nsynthesize images from the neural density features obtained from the learned\ndensity space. A density-based fusion module and geometric regularization are\nproposed to optimize the density space. By conducting experiments on a widely\nused autonomous driving dataset, we have validated the effectiveness of DGNR in\nsynthesizing photorealistic driving scenes and achieving real-time capable\nrendering.\n","authors":["Zhuopeng Li","Chenming Wu","Liangjun Zhang","Jianke Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.16664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16657v1","updated":"2023-11-28T10:18:16Z","published":"2023-11-28T10:18:16Z","title":"SCALAR-NeRF: SCAlable LARge-scale Neural Radiance Fields for Scene\n Reconstruction","summary":" In this work, we introduce SCALAR-NeRF, a novel framework tailored for\nscalable large-scale neural scene reconstruction. We structure the neural\nrepresentation as an encoder-decoder architecture, where the encoder processes\n3D point coordinates to produce encoded features, and the decoder generates\ngeometric values that include volume densities of signed distances and colors.\nOur approach first trains a coarse global model on the entire image dataset.\nSubsequently, we partition the images into smaller blocks using KMeans with\neach block being modeled by a dedicated local model. We enhance the overlapping\nregions across different blocks by scaling up the bounding boxes of each local\nblock. Notably, the decoder from the global model is shared across distinct\nblocks and therefore promoting alignment in the feature space of local\nencoders. We propose an effective and efficient methodology to fuse the outputs\nfrom these local models to attain the final reconstruction. 
Employing this\nrefined coarse-to-fine strategy, our method outperforms state-of-the-art NeRF\nmethods and demonstrates scalability for large-scale scene reconstruction. The\ncode will be available on our project page at\nhttps://aibluefisher.github.io/SCALAR-NeRF/\n","authors":["Yu Chen","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2311.16657v1.pdf","comment":"Project Page: https://aibluefisher.github.io/SCALAR-NeRF"},{"id":"http://arxiv.org/abs/2311.14971v2","updated":"2023-11-28T10:08:35Z","published":"2023-11-25T09:08:30Z","title":"Segmentation of diagnostic tissue compartments on whole slide images\n with renal thrombotic microangiopathies (TMAs)","summary":" The thrombotic microangiopathies (TMAs) manifest in renal biopsy histology\nwith a broad spectrum of acute and chronic findings. Precise diagnostic\ncriteria for a renal biopsy diagnosis of TMA are missing. As a first step\ntowards a machine learning- and computer vision-based analysis of wholes slide\nimages from renal biopsies, we trained a segmentation model for the decisive\ndiagnostic kidney tissue compartments artery, arteriole, glomerulus on a set of\nwhole slide images from renal biopsies with TMAs and Mimickers (distinct\ndiseases with a similar nephropathological appearance as TMA like severe benign\nnephrosclerosis, various vasculitides, Bevacizumab-plug glomerulopathy,\narteriolar light chain deposition disease). Our segmentation model combines a\nU-Net-based tissue detection with a Shifted windows-transformer architecture to\nreach excellent segmentation results for even the most severely altered\nglomeruli, arterioles and arteries, even on unseen staining domains from a\ndifferent nephropathology lab. With accurate automatic segmentation of the\ndecisive renal biopsy compartments in human renal vasculopathies, we have laid\nthe foundation for large-scale compartment-specific machine learning and\ncomputer vision analysis of renal biopsy repositories with TMAs.\n","authors":["Huy Q. Vo","Pietro A. Cicalese","Surya Seshan","Syed A. Rizvi","Aneesh Vathul","Gloria Bueno","Anibal Pedraza Dorado","Niels Grabe","Katharina Stolle","Francesco Pesce","Joris J. T. H. Roelofs","Jesper Kers","Vitoantonio Bevilacqua","Nicola Altini","Bernd Schröppel","Dario Roccatello","Antonella Barreca","Savino Sciascia","Chandra Mohan","Hien V. Nguyen","Jan U. Becker"],"pdf_url":"https://arxiv.org/pdf/2311.14971v2.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2311.16652v1","updated":"2023-11-28T10:05:44Z","published":"2023-11-28T10:05:44Z","title":"Augmenting x-ray single particle imaging reconstruction with\n self-supervised machine learning","summary":" The development of X-ray Free Electron Lasers (XFELs) has opened numerous\nopportunities to probe atomic structure and ultrafast dynamics of various\nmaterials. Single Particle Imaging (SPI) with XFELs enables the investigation\nof biological particles in their natural physiological states with unparalleled\ntemporal resolution, while circumventing the need for cryogenic conditions or\ncrystallization. However, reconstructing real-space structures from\nreciprocal-space x-ray diffraction data is highly challenging due to the\nabsence of phase and orientation information, which is further complicated by\nweak scattering signals and considerable fluctuations in the number of photons\nper pulse. 
In this work, we present an end-to-end, self-supervised machine\nlearning approach to recover particle orientations and estimate reciprocal\nspace intensities from diffraction images only. Our method demonstrates great\nrobustness under demanding experimental conditions with significantly enhanced\nreconstruction capabilities compared with conventional algorithms, and\nsignifies a paradigm shift in SPI as currently practiced at XFELs.\n","authors":["Zhantao Chen","Cong Wang","Mingye Gao","Chun Hong Yoon","Jana B. Thayer","Joshua J. Turner"],"pdf_url":"https://arxiv.org/pdf/2311.16652v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.12308v4","updated":"2023-11-28T09:58:15Z","published":"2023-04-24T17:57:15Z","title":"Segment Anything in 3D with NeRFs","summary":" Recently, the Segment Anything Model (SAM) emerged as a powerful vision\nfoundation model which is capable to segment anything in 2D images. This paper\naims to generalize SAM to segment 3D objects. Rather than replicating the data\nacquisition and annotation procedure which is costly in 3D, we design an\nefficient solution, leveraging the Neural Radiance Field (NeRF) as a cheap and\noff-the-shelf prior that connects multi-view 2D images to the 3D space. We\nrefer to the proposed solution as SA3D, for Segment Anything in 3D. It is only\nrequired to provide a manual segmentation prompt (e.g., rough points) for the\ntarget object in a single view, which is used to generate its 2D mask in this\nview with SAM. Next, SA3D alternately performs mask inverse rendering and\ncross-view self-prompting across various views to iteratively complete the 3D\nmask of the target object constructed with voxel grids. The former projects the\n2D mask obtained by SAM in the current view onto 3D mask with guidance of the\ndensity distribution learned by the NeRF; The latter extracts reliable prompts\nautomatically as the input to SAM from the NeRF-rendered 2D mask in another\nview. We show in experiments that SA3D adapts to various scenes and achieves 3D\nsegmentation within minutes. Our research reveals a potential methodology to\nlift the ability of a 2D vision foundation model to 3D, as long as the 2D model\ncan steadily address promptable segmentation across multiple views. Our code is\navailable at https://github.com/Jumpat/SegmentAnythingin3D.\n","authors":["Jiazhong Cen","Zanwei Zhou","Jiemin Fang","Chen Yang","Wei Shen","Lingxi Xie","Dongsheng Jiang","Xiaopeng Zhang","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2304.12308v4.pdf","comment":"NeurIPS 2023. Project page: https://jumpat.github.io/SA3D/"},{"id":"http://arxiv.org/abs/2311.05332v2","updated":"2023-11-28T09:47:57Z","published":"2023-11-09T12:58:37Z","title":"On the Road with GPT-4V(ision): Early Explorations of Visual-Language\n Model on Autonomous Driving","summary":" The pursuit of autonomous driving technology hinges on the sophisticated\nintegration of perception, decision-making, and control systems. Traditional\napproaches, both data-driven and rule-based, have been hindered by their\ninability to grasp the nuance of complex driving environments and the\nintentions of other road users. This has been a significant bottleneck,\nparticularly in the development of common sense reasoning and nuanced scene\nunderstanding necessary for safe and reliable autonomous driving. The advent of\nVisual Language Models (VLM) represents a novel frontier in realizing fully\nautonomous vehicle driving. 
This report provides an exhaustive evaluation of\nthe latest state-of-the-art VLM, GPT-4V(ision), and its application in\nautonomous driving scenarios. We explore the model's abilities to understand\nand reason about driving scenes, make decisions, and ultimately act in the\ncapacity of a driver. Our comprehensive tests span from basic scene recognition\nto complex causal reasoning and real-time decision-making under varying\nconditions. Our findings reveal that GPT-4V demonstrates superior performance\nin scene understanding and causal reasoning compared to existing autonomous\nsystems. It showcases the potential to handle out-of-distribution scenarios,\nrecognize intentions, and make informed decisions in real driving contexts.\nHowever, challenges remain, particularly in direction discernment, traffic\nlight recognition, vision grounding, and spatial reasoning tasks. These\nlimitations underscore the need for further research and development. The project\nis now available on GitHub for interested parties to access and utilize:\n\\url{https://github.com/PJLab-ADG/GPT4V-AD-Exploration}\n","authors":["Licheng Wen","Xuemeng Yang","Daocheng Fu","Xiaofeng Wang","Pinlong Cai","Xin Li","Tao Ma","Yingxuan Li","Linran Xu","Dengke Shang","Zheng Zhu","Shaoyan Sun","Yeqi Bai","Xinyu Cai","Min Dou","Shuanglu Hu","Botian Shi","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2311.05332v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16637v1","updated":"2023-11-28T09:44:01Z","published":"2023-11-28T09:44:01Z","title":"Parallax-Tolerant Image Stitching with Epipolar Displacement Field","summary":" Large parallax image stitching is a challenging task. Existing methods often\nstruggle to maintain both the local and global structures of the image while\nreducing alignment artifacts and warping distortions. In this paper, we propose\na novel approach that utilizes epipolar geometry to establish a warping\ntechnique based on the epipolar displacement field. Initially, the warping rule\nfor pixels in the epipolar geometry is established through the infinite\nhomography. Subsequently, the epipolar displacement field, which\nrepresents the sliding distance of the warped pixel along the epipolar line, is\nformulated by thin plate splines based on the principle of local elastic\ndeformation. The stitching result can be generated by inversely warping the\npixels according to the epipolar displacement field. This method incorporates\nthe epipolar constraints in the warping rule, which ensures high-quality\nalignment and maintains the projectivity of the panorama. Qualitative and\nquantitative comparative experiments demonstrate the competitiveness of the\nproposed method in stitching images with large parallax.\n","authors":["Jian Yu","Yi Yu","Feipeng Da"],"pdf_url":"https://arxiv.org/pdf/2311.16637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16635v1","updated":"2023-11-28T09:38:45Z","published":"2023-11-28T09:38:45Z","title":"MotionZero:Exploiting Motion Priors for Zero-shot Text-to-Video\n Generation","summary":" Zero-shot Text-to-Video synthesis generates videos based on prompts without\nany videos. Without motion information from videos, motion priors implied in\nprompts are vital guidance. For example, the prompt \"airplane landing on the\nrunway\" indicates motion priors that the \"airplane\" moves downwards while the\n\"runway\" stays static. 
Whereas the motion priors are not fully exploited in\nprevious approaches, thus leading to two nontrivial issues: 1) the motion\nvariation pattern remains unaltered and prompt-agnostic for disregarding motion\npriors; 2) the motion control of different objects is inaccurate and entangled\nwithout considering the independent motion priors of different objects. To\ntackle the two issues, we propose a prompt-adaptive and disentangled motion\ncontrol strategy coined as MotionZero, which derives motion priors from prompts\nof different objects by Large-Language-Models and accordingly applies motion\ncontrol of different objects to corresponding regions in disentanglement.\nFurthermore, to facilitate videos with varying degrees of motion amplitude, we\npropose a Motion-Aware Attention scheme which adjusts attention among frames by\nmotion amplitude. Extensive experiments demonstrate that our strategy could\ncorrectly control motion of different objects and support versatile\napplications including zero-shot video edit.\n","authors":["Sitong Su","Litao Guo","Lianli Gao","Hengtao Shen","Jingkuan Song"],"pdf_url":"https://arxiv.org/pdf/2311.16635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12476v4","updated":"2023-11-28T09:36:44Z","published":"2023-05-21T14:40:48Z","title":"Zero-shot Visual Relation Detection via Composite Visual Cues from Large\n Language Models","summary":" Pretrained vision-language models, such as CLIP, have demonstrated strong\ngeneralization capabilities, making them promising tools in the realm of\nzero-shot visual recognition. Visual relation detection (VRD) is a typical task\nthat identifies relationship (or interaction) types between object pairs within\nan image. However, naively utilizing CLIP with prevalent class-based prompts\nfor zero-shot VRD has several weaknesses, e.g., it struggles to distinguish\nbetween different fine-grained relation types and it neglects essential spatial\ninformation of two objects. To this end, we propose a novel method for\nzero-shot VRD: RECODE, which solves RElation detection via COmposite\nDEscription prompts. Specifically, RECODE first decomposes each predicate\ncategory into subject, object, and spatial components. Then, it leverages large\nlanguage models (LLMs) to generate description-based prompts (or visual cues)\nfor each component. Different visual cues enhance the discriminability of\nsimilar relation categories from different perspectives, which significantly\nboosts performance in VRD. To dynamically fuse different cues, we further\nintroduce a chain-of-thought method that prompts LLMs to generate reasonable\nweights for different visual cues. Extensive experiments on four VRD benchmarks\nhave demonstrated the effectiveness and interpretability of RECODE.\n","authors":["Lin Li","Jun Xiao","Guikun Chen","Jian Shao","Yueting Zhuang","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2305.12476v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13620v2","updated":"2023-11-28T09:28:53Z","published":"2023-09-24T12:29:13Z","title":"PRIS: Practical robust invertible network for image steganography","summary":" Image steganography is a technique of hiding secret information inside\nanother image, so that the secret is not visible to human eyes and can be\nrecovered when needed. Most of the existing image steganography methods have\nlow hiding robustness when the container images affected by distortion. Such as\nGaussian noise and lossy compression. 
This paper proposes PRIS to improve the\nrobustness of image steganography. It is based on invertible neural networks and\nputs two enhancement modules before and after the extraction process, with a 3-step\ntraining strategy. Moreover, rounding error is considered, which is always\nignored by existing methods but is unavoidable in practice. A\ngradient approximation function (GAF) is also proposed to overcome the\nnon-differentiability of rounding distortion. Experimental results show that\nour PRIS outperforms the state-of-the-art robust image steganography method in\nboth robustness and practicability. Code is available at\nhttps://github.com/yanghangAI/PRIS, and a demonstration of our model in practice at\nhttp://yanghang.site/hide/.\n","authors":["Hang Yang","Yitian Xu","Xuhua Liu","Xiaodong Ma"],"pdf_url":"https://arxiv.org/pdf/2309.13620v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16623v1","updated":"2023-11-28T09:24:42Z","published":"2023-11-28T09:24:42Z","title":"Visual Semantic Navigation with Real Robots","summary":" Visual Semantic Navigation (VSN) is the ability of a robot to learn visual\nsemantic information for navigating in unseen environments. These VSN models\nare typically tested in the virtual environments where they are trained,\nmainly using reinforcement learning-based approaches. Therefore, we do not yet\nhave an in-depth analysis of how these models would behave in the real world.\nIn this work, we propose a new solution to integrate VSN models into real\nrobots, so that we have true embodied agents. We also release a novel ROS-based\nframework for VSN, ROS4VSN, so that any VSN model can be easily deployed on any\nROS-compatible robot and tested in a real setting. Our experiments with two\ndifferent robots, where we have embedded two state-of-the-art VSN agents,\nconfirm that there is a noticeable performance difference of these VSN\nsolutions when tested in real-world and simulation environments. We hope that\nthis research will provide a foundation for addressing this\nconsequential issue, with the ultimate aim of advancing the performance and\nefficiency of embodied agents within authentic real-world scenarios. Code to\nreproduce all our experiments can be found at\nhttps://github.com/gramuah/ros4vsn.\n","authors":["Carlos Gutiérrez-Álvarez","Pablo Ríos-Navarro","Rafael Flor-Rodríguez","Francisco Javier Acevedo-Rodríguez","Roberto J. López-Sastre"],"pdf_url":"https://arxiv.org/pdf/2311.16623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03782v2","updated":"2023-11-28T09:23:30Z","published":"2023-11-07T08:05:09Z","title":"CapST: An Enhanced and Lightweight Model Attribution Approach for\n Synthetic Videos","summary":" Deepfake videos, generated through AI faceswapping techniques, have garnered\nconsiderable attention due to their potential for powerful impersonation\nattacks. While existing research primarily focuses on binary classification to\ndiscern between real and fake videos, determining the specific\ngeneration model for a fake video is crucial for forensic investigation.\nAddressing this gap, this paper investigates the model attribution problem of\nDeepfake videos from a recently proposed dataset, Deepfakes from Different\nModels (DFDM), derived from various Autoencoder models. 
The dataset comprises\n6,450 Deepfake videos generated by five distinct models with variations in\nencoder, decoder, intermediate layer, input resolution, and compression ratio.\nThis study formulates Deepfakes model attribution as a multiclass\nclassification task, proposing a segment of VGG19 as a feature extraction\nbackbone, known for its effectiveness in imagerelated tasks, while integrated a\nCapsule Network with a Spatio-Temporal attention mechanism. The Capsule module\ncaptures intricate hierarchies among features for robust identification of\ndeepfake attributes. Additionally, the video-level fusion technique leverages\ntemporal attention mechanisms to handle concatenated feature vectors,\ncapitalizing on inherent temporal dependencies in deepfake videos. By\naggregating insights across frames, our model gains a comprehensive\nunderstanding of video content, resulting in more precise predictions.\nExperimental results on the deepfake benchmark dataset (DFDM) demonstrate the\nefficacy of our proposed method, achieving up to a 4% improvement in accurately\ncategorizing deepfake videos compared to baseline models while demanding fewer\ncomputational resources.\n","authors":["Wasim Ahmad","Yan-Tsung Peng","Yuan-Hao Chang","Gaddisa Olani Ganfure","Sarwar Khan","Sahibzada Adil Shahzad"],"pdf_url":"https://arxiv.org/pdf/2311.03782v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16618v1","updated":"2023-11-28T09:18:42Z","published":"2023-11-28T09:18:42Z","title":"Cross-level Attention with Overlapped Windows for Camouflaged Object\n Detection","summary":" Camouflaged objects adaptively fit their color and texture with the\nenvironment, which makes them indistinguishable from the surroundings. Current\nmethods revealed that high-level semantic features can highlight the\ndifferences between camouflaged objects and the backgrounds. Consequently, they\nintegrate high-level semantic features with low-level detailed features for\naccurate camouflaged object detection (COD). Unlike previous designs for\nmulti-level feature fusion, we state that enhancing low-level features is more\nimpending for COD. In this paper, we propose an overlapped window cross-level\nattention (OWinCA) to achieve the low-level feature enhancement guided by the\nhighest-level features. By sliding an aligned window pair on both the highest-\nand low-level feature maps, the high-level semantics are explicitly integrated\ninto the low-level details via cross-level attention. Additionally, it employs\nan overlapped window partition strategy to alleviate the incoherence among\nwindows, which prevents the loss of global information. These adoptions enable\nthe proposed OWinCA to enhance low-level features by promoting the separability\nof camouflaged objects. The associated proposed OWinCANet fuses these enhanced\nmulti-level features by simple convolution operation to achieve the final COD.\nExperiments conducted on three large-scale COD datasets demonstrate that our\nOWinCANet significantly surpasses the current state-of-the-art COD methods.\n","authors":["Jiepan Li","Fangxiao Lu","Nan Xue","Zhuohong Li","Hongyan Zhang","Wei He"],"pdf_url":"https://arxiv.org/pdf/2311.16618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.03178v2","updated":"2023-11-28T09:07:46Z","published":"2023-01-09T06:02:36Z","title":"Deep Planar Parallax for Monocular Depth Estimation","summary":" Recent research has highlighted the utility of Planar Parallax Geometry in\nmonocular depth estimation. 
However, its potential has yet to be fully realized\nbecause networks rely heavily on appearance for depth prediction. Our in-depth\nanalysis reveals that utilizing flow-pretrain can optimize the network's usage\nof consecutive frame modeling, leading to substantial performance enhancement.\nAdditionally, we propose Planar Position Embedding (PPE) to handle dynamic\nobjects that defy static scene assumptions and to tackle slope variations that\nare challenging to differentiate. Comprehensive experiments on autonomous\ndriving datasets, namely KITTI and the Waymo Open Dataset (WOD), prove that our\nPlanar Parallax Network (PPNet) significantly surpasses existing learning-based\nmethods in performance.\n","authors":["Haoqian Liang","Zhichao Li","Ya Yang","Naiyan Wang"],"pdf_url":"https://arxiv.org/pdf/2301.03178v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16613v1","updated":"2023-11-28T09:02:38Z","published":"2023-11-28T09:02:38Z","title":"Filter-Pruning of Lightweight Face Detectors Using a Geometric Median\n Criterion","summary":" Face detectors are becoming a crucial component of many applications,\nincluding surveillance, that often have to run on edge devices with limited\nprocessing power and memory. Therefore, there's a pressing demand for compact\nface detection models that can function efficiently across resource-constrained\ndevices. Over recent years, network pruning techniques have attracted a lot of\nattention from researchers. These methods haven't been well examined in the\ncontext of face detectors, despite their expanding popularity. In this paper,\nwe implement filter pruning on two already small and compact face detectors,\nnamed EXTD (Extremely Tiny Face Detector) and EResFD (Efficient ResNet Face\nDetector). The main pruning algorithm that we utilize is Filter Pruning via\nGeometric Median (FPGM), combined with the Soft Filter Pruning (SFP) iterative\nprocedure. We also apply L1 Norm pruning, as a baseline to compare with the\nproposed approach. The experimental evaluation on the WIDER FACE dataset\nindicates that the proposed approach has the potential to further reduce the\nmodel size of already lightweight face detectors, with limited accuracy loss,\nor even with small accuracy gain for low pruning rates.\n","authors":["Konstantinos Gkrispanis","Nikolaos Gkalelis","Vasileios Mezaris"],"pdf_url":"https://arxiv.org/pdf/2311.16613v1.pdf","comment":"Accepted for publication in the IEEE/CVF WACV 2024 Workshops\n proceedings, Hawaii, USA, Jan. 2024"},{"id":"http://arxiv.org/abs/2308.00929v2","updated":"2023-11-28T08:57:12Z","published":"2023-08-02T04:10:14Z","title":"Towards Discriminative Representation with Meta-learning for\n Colonoscopic Polyp Re-Identification","summary":" Colonoscopic Polyp Re-Identification aims to match the same polyp from a\nlarge gallery with images from different views taken using different cameras\nand plays an important role in the prevention and treatment of colorectal\ncancer in computer-aided diagnosis. However, traditional methods for object\nReID directly adopting CNN models trained on the ImageNet dataset usually\nproduce unsatisfactory retrieval performance on colonoscopic datasets due to\nthe large domain gap. 
Additionally, these methods neglect to explore the\npotential of self-discrepancy among intra-class relations in the colonoscopic\npolyp dataset, which remains an open research problem in the medical community.\nTo solve this dilemma, we propose a simple but effective training method named\nColo-ReID, which can help our model learn more general and discriminative\nknowledge based on the meta-learning strategy in scenarios with fewer samples.\nBased on this, a dynamic Meta-Learning Regulation mechanism called MLR is\nintroduced to further boost the performance of polyp re-identification. To the\nbest of our knowledge, this is the first attempt to leverage the meta-learning\nparadigm instead of traditional machine learning algorithm to effectively train\ndeep models in the task of colonoscopic polyp re-identification. Empirical\nresults show that our method significantly outperforms current state-of-the-art\nmethods by a clear margin.\n","authors":["Suncheng Xiang","Qingzhong Chen","Shilun Cai","Chengfeng Zhou","Crystal Cai","Sijia Du","Zhengjie Zhang","Yunshi Zhong","Dahong Qian"],"pdf_url":"https://arxiv.org/pdf/2308.00929v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12793v2","updated":"2023-11-28T08:52:50Z","published":"2023-11-21T18:58:11Z","title":"ShareGPT4V: Improving Large Multi-Modal Models with Better Captions","summary":" In the realm of large multi-modal models (LMMs), efficient modality alignment\nis crucial yet often constrained by the scarcity of high-quality image-text\ndata. To address this bottleneck, we introduce the ShareGPT4V dataset, a\npioneering large-scale resource featuring 1.2 million highly descriptive\ncaptions, which surpasses existing datasets in diversity and information\ncontent, covering world knowledge, object properties, spatial relationships,\nand aesthetic evaluations. Specifically, ShareGPT4V originates from a curated\n100K high-quality captions collected from advanced GPT4-Vision and has been\nexpanded to 1.2M with a superb caption model trained on this subset. ShareGPT4V\nfirst demonstrates its effectiveness for the Supervised Fine-Tuning (SFT)\nphase, by substituting an equivalent quantity of detailed captions in existing\nSFT datasets with a subset of our high-quality captions, significantly\nenhancing the LMMs like LLaVA-7B, LLaVA-1.5-13B, and Qwen-VL-Chat-7B on the MME\nand MMBench benchmarks, with respective gains of 222.8/22.0/22.3 and\n2.7/1.3/1.5. We further incorporate ShareGPT4V data into both the pre-training\nand SFT phases, obtaining ShareGPT4V-7B, a superior LMM based on a simple\narchitecture that has remarkable performance across a majority of the\nmulti-modal benchmarks. This project is available at\nhttps://ShareGPT4V.github.io to serve as a pivotal resource for advancing the\nLMMs community.\n","authors":["Lin Chen","Jinsong Li","Xiaoyi Dong","Pan Zhang","Conghui He","Jiaqi Wang","Feng Zhao","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2311.12793v2.pdf","comment":"Project: https://ShareGPT4V.github.io"},{"id":"http://arxiv.org/abs/2310.08165v2","updated":"2023-11-28T08:45:13Z","published":"2023-10-12T09:37:56Z","title":"COVID-19 detection using ViT transformer-based approach from Computed\n Tomography Images","summary":" In here, we introduce a novel approach to enhance the accuracy and efficiency\nof COVID-19 diagnosis using CT images. 
Leveraging state-of-the-art Transformer\nmodels in computer vision, we employed the base ViT Transformer configured for\n224x224-sized input images, modifying the output to suit the binary\nclassification task. Notably, input images were resized from the standard CT\nscan size of 512x512 to match the model's expectations. Our method implements a\nsystematic patient-level prediction strategy, classifying individual CT slices\nas COVID-19 or non-COVID. To determine the overall diagnosis for each patient,\na majority voting approach as well as other thresholding approaches were\nemployed. This method involves evaluating all CT slices for a given patient and\nassigning the patient the diagnosis that relates to the thresholding for the CT\nscan. This meticulous patient-level prediction process contributes to the\nrobustness of our solution as it starts from 2D-slices to 3D-patient level.\nThroughout the evaluation process, our approach resulted in 0.7 macro F1 score\non the COV19-CT -DB validation set. To ensure the reliability and effectiveness\nof our model, we rigorously validate it on the extensive COV-19 CT dataset,\nwhich is meticulously annotated for the task. This dataset, with its\ncomprehensive annotations, reinforces the overall robustness of our solution.\n","authors":["Kenan Morani"],"pdf_url":"https://arxiv.org/pdf/2310.08165v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20604v2","updated":"2023-11-28T08:29:18Z","published":"2023-10-31T16:39:56Z","title":"Enhanced Synthetic MRI Generation from CT Scans Using CycleGAN with\n Feature Extraction","summary":" In the field of radiotherapy, accurate imaging and image registration are of\nutmost importance for precise treatment planning. Magnetic Resonance Imaging\n(MRI) offers detailed imaging without being invasive and excels in soft-tissue\ncontrast, making it a preferred modality for radiotherapy planning. However,\nthe high cost of MRI, longer acquisition time, and certain health\nconsiderations for patients pose challenges. Conversely, Computed Tomography\n(CT) scans offer a quicker and less expensive imaging solution. To bridge these\nmodalities and address multimodal alignment challenges, we introduce an\napproach for enhanced monomodal registration using synthetic MRI images.\nUtilizing unpaired data, this paper proposes a novel method to produce these\nsynthetic MRI images from CT scans, leveraging CycleGANs and feature\nextractors. By building upon the foundational work on Cycle-Consistent\nAdversarial Networks and incorporating advancements from related literature,\nour methodology shows promising results, outperforming several state-of-the-art\nmethods. The efficacy of our approach is validated by multiple comparison\nmetrics.\n","authors":["Saba Nikbakhsh","Lachin Naghashyar","Morteza Valizadeh","Mehdi Chehel Amirani"],"pdf_url":"https://arxiv.org/pdf/2310.20604v2.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.16593v1","updated":"2023-11-28T08:18:30Z","published":"2023-11-28T08:18:30Z","title":"Empowering COVID-19 Detection: Optimizing Performance Through Fine-Tuned\n EfficientNet Deep Learning Architecture","summary":" The worldwide COVID-19 pandemic has profoundly influenced the health and\neveryday experiences of individuals across the planet. It is a highly\ncontagious respiratory disease requiring early and accurate detection to curb\nits rapid transmission. 
Initial testing methods primarily revolved around\nidentifying the genetic composition of the coronavirus, exhibiting a relatively\nlow detection rate and requiring a time-intensive procedure. To address this\nchallenge, experts have suggested using radiological imagery, particularly\nchest X-rays, as a valuable approach within the diagnostic protocol. This study\ninvestigates the potential of leveraging radiographic imaging (X-rays) with\ndeep learning algorithms to swiftly and precisely identify COVID-19 patients.\nThe proposed approach elevates detection accuracy by fine-tuning appropriate\nlayers of various established transfer learning models. The\nexperimentation was conducted on a COVID-19 X-ray dataset containing 2000\nimages. The accuracy achieved was an impressive 100% for the EfficientNetB4\nmodel. The fine-tuned EfficientNetB4 achieved an excellent accuracy score,\nshowcasing its potential as a robust COVID-19 detection model. Furthermore,\nEfficientNetB4 excelled in identifying lung disease using a chest X-ray dataset\ncontaining 4,350 images, achieving remarkable performance with an accuracy of\n99.17%, precision of 99.13%, recall of 99.16%, and f1-score of 99.14%. These\nresults highlight the promise of fine-tuned transfer learning for efficient\nlung disease detection through medical imaging, especially with X-ray images. This\nresearch offers radiologists an effective means of aiding rapid and precise\nCOVID-19 diagnosis and contributes valuable assistance for healthcare\nprofessionals in accurately identifying affected patients.\n","authors":["Md. Alamin Talukder","Md. Abu Layek","Mohsin Kazi","Md Ashraf Uddin","Sunil Aryal"],"pdf_url":"https://arxiv.org/pdf/2311.16593v1.pdf","comment":"Computers in Biology and Medicine [Q1, IF: 7.7, CS: 9.2]"},{"id":"http://arxiv.org/abs/2311.16589v1","updated":"2023-11-28T08:15:27Z","published":"2023-11-28T08:15:27Z","title":"Improving Lane Detection Generalization: A Novel Framework using HD Maps\n for Boosting Diversity","summary":" Lane detection is a vital task for vehicles to navigate and localize their\nposition on the road. To ensure reliable results, lane detection algorithms\nmust have robust generalization performance in various road environments.\nHowever, despite the significant performance improvement of deep learning-based\nlane detection algorithms, their generalization performance in response to\nchanges in road environments still falls short of expectations. In this paper,\nwe present a novel framework for single-source domain generalization (SSDG) in\nlane detection. By decomposing data into lane structures and surroundings, we\nenhance diversity using High-Definition (HD) maps and generative models. Rather\nthan expanding data volume, we strategically select a core subset of data,\nmaximizing diversity and optimizing performance. Our extensive experiments\ndemonstrate that our framework enhances the generalization performance of lane\ndetection, comparable to the domain adaptation-based method.\n","authors":["Daeun Lee","Minhyeok Heo","Jiwon Kim"],"pdf_url":"https://arxiv.org/pdf/2311.16589v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.16581v1","updated":"2023-11-28T07:55:25Z","published":"2023-11-28T07:55:25Z","title":"GeoScaler: Geometry and Rendering-Aware Downsampling of 3D Mesh Textures","summary":" High-resolution texture maps are necessary for representing real-world\nobjects accurately with 3D meshes. 
The large sizes of textures can bottleneck\nthe real-time rendering of high-quality virtual 3D scenes on devices having low\ncomputational budgets and limited memory. Downsampling the texture maps\ndirectly addresses the issue, albeit at the cost of visual fidelity.\nTraditionally, downsampling of texture maps is performed using methods like\nbicubic interpolation and the Lanczos algorithm. These methods ignore the\ngeometric layout of the mesh and its UV parametrization and also do not account\nfor the rendering process used to obtain the final visualization that the users\nwill experience. Towards filling these gaps, we introduce GeoScaler, which is a\nmethod of downsampling texture maps of 3D meshes while incorporating geometric\ncues, and by maximizing the visual fidelity of the rendered views of the\ntextured meshes. We show that the textures generated by GeoScaler deliver\nsignificantly better quality rendered images compared to those generated by\ntraditional downsampling methods\n","authors":["Sai Karthikey Pentapati","Anshul Rai","Arkady Ten","Chaitanya Atluru","Alan Bovik"],"pdf_url":"https://arxiv.org/pdf/2311.16581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16580v1","updated":"2023-11-28T07:54:27Z","published":"2023-11-28T07:54:27Z","title":"Clean Label Disentangling for Medical Image Segmentation with Noisy\n Labels","summary":" Current methods focusing on medical image segmentation suffer from incorrect\nannotations, which is known as the noisy label issue. Most medical image\nsegmentation with noisy labels methods utilize either noise transition matrix,\nnoise-robust loss functions or pseudo-labeling methods, while none of the\ncurrent research focuses on clean label disentanglement. We argue that the main\nreason is that the severe class-imbalanced issue will lead to the inaccuracy of\nthe selected ``clean'' labels, thus influencing the robustness of the model\nagainst the noises. In this work, we come up with a simple but efficient\nclass-balanced sampling strategy to tackle the class-imbalanced problem, which\nenables our newly proposed clean label disentangling framework to successfully\nselect clean labels from the given label sets and encourages the model to learn\nfrom the correct annotations. However, such a method will filter out too many\nannotations which may also contain useful information. Therefore, we further\nextend our clean label disentangling framework to a new noisy feature-aided\nclean label disentangling framework, which takes the full annotations into\nutilization to learn more semantics. Extensive experiments have validated the\neffectiveness of our methods, where our methods achieve new state-of-the-art\nperformance. Our code is available at https://github.com/xiaoyao3302/2BDenoise.\n","authors":["Zicheng Wang","Zhen Zhao","Erjian Guo","Luping Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.16580v1.pdf","comment":"13 pages, 6 figures, 11 tables"},{"id":"http://arxiv.org/abs/2311.16577v1","updated":"2023-11-28T07:40:16Z","published":"2023-11-28T07:40:16Z","title":"Efficient Key-Based Adversarial Defense for ImageNet by Using\n Pre-trained Model","summary":" In this paper, we propose key-based defense model proliferation by leveraging\npre-trained models and utilizing recent efficient fine-tuning techniques on\nImageNet-1k classification. 
First, we stress that deploying key-based models on\nedge devices is feasible with the latest model deployment advancements, such as\nApple CoreML, although the mainstream enterprise edge artificial intelligence\n(Edge AI) has been focused on the Cloud. Then, we point out that the previous\nkey-based defense on on-device image classification is impractical for two\nreasons: (1) training many classifiers from scratch is not feasible, and (2)\nkey-based defenses still need to be thoroughly tested on large datasets like\nImageNet. To this end, we propose to leverage pre-trained models and utilize\nefficient fine-tuning techniques to proliferate key-based models even on\nlimited computing resources. Experiments were carried out on the ImageNet-1k\ndataset using adaptive and non-adaptive attacks. The results show that our\nproposed fine-tuned key-based models achieve a superior classification accuracy\n(more than 10% increase) compared to the previous key-based models on\nclassifying clean and adversarial examples.\n","authors":["AprilPyone MaungMaung","Isao Echizen","Hitoshi Kiya"],"pdf_url":"https://arxiv.org/pdf/2311.16577v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16567v1","updated":"2023-11-28T07:14:41Z","published":"2023-11-28T07:14:41Z","title":"MobileDiffusion: Subsecond Text-to-Image Generation on Mobile Devices","summary":" The deployment of large-scale text-to-image diffusion models on mobile\ndevices is impeded by their substantial model size and slow inference speed. In\nthis paper, we propose \\textbf{MobileDiffusion}, a highly efficient\ntext-to-image diffusion model obtained through extensive optimizations in both\narchitecture and sampling techniques. We conduct a comprehensive examination of\nmodel architecture design to reduce redundancy, enhance computational\nefficiency, and minimize model's parameter count, while preserving image\ngeneration quality. Additionally, we employ distillation and diffusion-GAN\nfinetuning techniques on MobileDiffusion to achieve 8-step and 1-step inference\nrespectively. Empirical studies, conducted both quantitatively and\nqualitatively, demonstrate the effectiveness of our proposed techniques.\nMobileDiffusion achieves a remarkable \\textbf{sub-second} inference speed for\ngenerating a $512\\times512$ image on mobile devices, establishing a new state\nof the art.\n","authors":["Yang Zhao","Yanwu Xu","Zhisheng Xiao","Tingbo Hou"],"pdf_url":"https://arxiv.org/pdf/2311.16567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16495v1","updated":"2023-11-28T07:13:47Z","published":"2023-11-28T07:13:47Z","title":"Egocentric Whole-Body Motion Capture with FisheyeViT and Diffusion-Based\n Motion Refinement","summary":" In this work, we explore egocentric whole-body motion capture using a single\nfisheye camera, which simultaneously estimates human body and hand motion. This\ntask presents significant challenges due to three factors: the lack of\nhigh-quality datasets, fisheye camera distortion, and human body\nself-occlusion. To address these challenges, we propose a novel approach that\nleverages FisheyeViT to extract fisheye image features, which are subsequently\nconverted into pixel-aligned 3D heatmap representations for 3D human body pose\nprediction. For hand tracking, we incorporate dedicated hand detection and hand\npose estimation networks for regressing 3D hand poses. Finally, we develop a\ndiffusion-based whole-body motion prior model to refine the estimated\nwhole-body motion while accounting for joint uncertainties. 
To train these\nnetworks, we collect a large synthetic dataset, EgoWholeBody, comprising\n840,000 high-quality egocentric images captured across a diverse range of\nwhole-body motion sequences. Quantitative and qualitative evaluations\ndemonstrate the effectiveness of our method in producing high-quality\nwhole-body motion estimates from a single egocentric camera.\n","authors":["Jian Wang","Zhe Cao","Diogo Luvizon","Lingjie Liu","Kripasindhu Sarkar","Danhang Tang","Thabo Beeler","Christian Theobalt"],"pdf_url":"https://arxiv.org/pdf/2311.16495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16565v1","updated":"2023-11-28T07:13:20Z","published":"2023-11-28T07:13:20Z","title":"DiffusionTalker: Personalization and Acceleration for Speech-Driven 3D\n Face Diffuser","summary":" Speech-driven 3D facial animation has been an attractive task in both\nacademia and industry. Traditional methods mostly focus on learning a\ndeterministic mapping from speech to animation. Recent approaches start to\nconsider the non-deterministic fact of speech-driven 3D face animation and\nemploy the diffusion model for the task. However, personalizing facial\nanimation and accelerating animation generation are still two major limitations\nof existing diffusion-based methods. To address the above limitations, we\npropose DiffusionTalker, a diffusion-based method that utilizes contrastive\nlearning to personalize 3D facial animation and knowledge distillation to\naccelerate 3D animation generation. Specifically, to enable personalization, we\nintroduce a learnable talking identity to aggregate knowledge in audio\nsequences. The proposed identity embeddings extract customized facial cues\nacross different people in a contrastive learning manner. During inference,\nusers can obtain personalized facial animation based on input audio, reflecting\na specific talking style. With a trained diffusion model with hundreds of\nsteps, we distill it into a lightweight model with 8 steps for acceleration.\nExtensive experiments are conducted to demonstrate that our method outperforms\nstate-of-the-art methods. The code will be released.\n","authors":["Peng Chen","Xiaobao Wei","Ming Lu","Yitong Zhu","Naiming Yao","Xingyu Xiao","Hui Chen"],"pdf_url":"https://arxiv.org/pdf/2311.16565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16555v1","updated":"2023-11-28T06:51:28Z","published":"2023-11-28T06:51:28Z","title":"Enhancing Scene Text Detectors with Realistic Text Image Synthesis Using\n Diffusion Models","summary":" Scene text detection techniques have garnered significant attention due to\ntheir wide-ranging applications. However, existing methods have a high demand\nfor training data, and obtaining accurate human annotations is labor-intensive\nand time-consuming. As a solution, researchers have widely adopted synthetic\ntext images as a complementary resource to real text images during\npre-training. Yet there is still room for synthetic datasets to enhance the\nperformance of scene text detectors. We contend that one main limitation of\nexisting generation methods is the insufficient integration of foreground text\nwith the background. To alleviate this problem, we present the Diffusion Model\nbased Text Generator (DiffText), a pipeline that utilizes the diffusion model\nto seamlessly blend foreground text regions with the background's intrinsic\nfeatures. Additionally, we propose two strategies to generate visually coherent\ntext with fewer spelling errors. 
With fewer text instances, our produced text\nimages consistently surpass other synthetic data in aiding text detectors.\nExtensive experiments on detecting horizontal, rotated, curved, and line-level\ntexts demonstrate the effectiveness of DiffText in producing realistic text\nimages.\n","authors":["Ling Fu","Zijie Wu","Yingying Zhu","Yuliang Liu","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2311.16555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16552v1","updated":"2023-11-28T06:42:44Z","published":"2023-11-28T06:42:44Z","title":"HandyPriors: Physically Consistent Perception of Hand-Object\n Interactions with Differentiable Priors","summary":" Various heuristic objectives for modeling hand-object interaction have been\nproposed in past work. However, due to the lack of a cohesive framework, these\nobjectives often possess a narrow scope of applicability and are limited by\ntheir efficiency or accuracy. In this paper, we propose HandyPriors, a unified\nand general pipeline for pose estimation in human-object interaction scenes by\nleveraging recent advances in differentiable physics and rendering. Our\napproach employs rendering priors to align with input images and segmentation\nmasks along with physics priors to mitigate penetration and relative-sliding\nacross frames. Furthermore, we present two alternatives for hand and object\npose estimation. The optimization-based pose estimation achieves higher\naccuracy, while the filtering-based tracking, which utilizes the differentiable\npriors as dynamics and observation models, executes faster. We demonstrate that\nHandyPriors attains comparable or superior results in the pose estimation task,\nand that the differentiable physics module can predict contact information for\npose refinement. We also show that our approach generalizes to perception\ntasks, including robotic hand manipulation and human-object pose estimation in\nthe wild.\n","authors":["Shutong Zhang","Yi-Ling Qiao","Guanglei Zhu","Eric Heiden","Dylan Turpin","Jingzhou Liu","Ming Lin","Miles Macklin","Animesh Garg"],"pdf_url":"https://arxiv.org/pdf/2311.16552v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16544v1","updated":"2023-11-28T06:25:26Z","published":"2023-11-28T06:25:26Z","title":"Multi-Irreducible Spectral Synchronization for Robust Rotation Averaging","summary":" Rotation averaging (RA) is a fundamental problem in robotics and computer\nvision. In RA, the goal is to estimate a set of $N$ unknown orientations\n$R_{1}, ..., R_{N} \\in SO(3)$, given noisy measurements $R_{ij} \\sim R^{-1}_{i}\nR_{j}$ of a subset of their pairwise relative rotations. This problem is both\nnonconvex and NP-hard, and thus difficult to solve in the general case. We\napply harmonic analysis on compact groups to derive a (convex) spectral\nrelaxation constructed from truncated Fourier decompositions of the individual\nsummands appearing in the RA objective; we then recover an estimate of the RA\nsolution by computing a few extremal eigenpairs of this relaxation, and\n(approximately) solving a consensus problem. Our approach affords several\nnotable advantages versus prior RA methods: it can be used in conjunction with\n\\emph{any} smooth loss function (including, but not limited to, robust\nM-estimators), does not require any initialization, and is implemented using\nonly simple (and highly scalable) linear-algebraic computations and\nparallelizable optimizations over band-limited functions of individual\nrotational states. 
Moreover, under the (physically well-motivated) assumption\nof multiplicative Langevin measurement noise, we derive explicit performance\nguarantees for our spectral estimator (in the form of probabilistic tail bounds\non the estimation error) that are parameterized in terms of graph-theoretic\nquantities of the underlying measurement network. By concretely linking\nestimator performance with properties of the underlying measurement graph, our\nresults also indicate how to devise measurement networks that are\n\\emph{guaranteed} to achieve accurate estimation, enabling such downstream\ntasks as sensor placement, network compression, and active sensing.\n","authors":["Owen Howell","Haoen Huang","David Rosen"],"pdf_url":"https://arxiv.org/pdf/2311.16544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16507v1","updated":"2023-11-28T06:19:30Z","published":"2023-11-28T06:19:30Z","title":"Exploring Straighter Trajectories of Flow Matching with Diffusion\n Guidance","summary":" Flow matching as a paradigm of generative model achieves notable success\nacross various domains. However, existing methods use either multi-round\ntraining or knowledge within minibatches, posing challenges in finding a\nfavorable coupling strategy for straight trajectories. To address this issue,\nwe propose a novel approach, Straighter trajectories of Flow Matching\n(StraightFM). It straightens trajectories with the coupling strategy guided by\ndiffusion model from entire distribution level. First, we propose a coupling\nstrategy to straighten trajectories, creating couplings between image and noise\nsamples under diffusion model guidance. Second, StraightFM also integrates real\ndata to enhance training, employing a neural network to parameterize another\ncoupling process from images to noise samples. StraightFM is jointly optimized\nwith couplings from above two mutually complementary directions, resulting in\nstraighter trajectories and enabling both one-step and few-step generation.\nExtensive experiments demonstrate that StraightFM yields high quality samples\nwith fewer step. StraightFM generates visually appealing images with a lower\nFID among diffusion and traditional flow matching methods within 5 sampling\nsteps when trained on pixel space. In the latent space (i.e., Latent\nDiffusion), StraightFM achieves a lower KID value compared to existing methods\non the CelebA-HQ 256 dataset in fewer than 10 sampling steps.\n","authors":["Siyu Xing","Jie Cao","Huaibo Huang","Xiao-Yu Zhang","Ran He"],"pdf_url":"https://arxiv.org/pdf/2311.16507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20323v2","updated":"2023-11-28T06:18:33Z","published":"2023-10-31T09:58:11Z","title":"SemanticBoost: Elevating Motion Generation with Augmented Textual Cues","summary":" Current techniques face difficulties in generating motions from intricate\nsemantic descriptions, primarily due to insufficient semantic annotations in\ndatasets and weak contextual understanding. To address these issues, we present\nSemanticBoost, a novel framework that tackles both challenges simultaneously.\nOur framework comprises a Semantic Enhancement module and a Context-Attuned\nMotion Denoiser (CAMD). The Semantic Enhancement module extracts supplementary\nsemantics from motion data, enriching the dataset's textual description and\nensuring precise alignment between text and motion data without depending on\nlarge language models. 
On the other hand, the CAMD approach provides an\nall-encompassing solution for generating high-quality, semantically consistent\nmotion sequences by effectively capturing context information and aligning the\ngenerated motion with the given textual descriptions. Distinct from existing\nmethods, our approach can synthesize accurate orientational movements, combined\nmotions based on specific body part descriptions, and motions generated from\ncomplex, extended sentences. Our experimental results demonstrate that\nSemanticBoost, as a diffusion-based method, outperforms auto-regressive-based\ntechniques, achieving cutting-edge performance on the HumanML3D dataset while\nmaintaining realistic and smooth motion generation quality.\n","authors":["Xin He","Shaoli Huang","Xiaohang Zhan","Chao Weng","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2310.20323v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16542v1","updated":"2023-11-28T06:16:30Z","published":"2023-11-28T06:16:30Z","title":"Agents meet OKR: An Object and Key Results Driven Agent System with\n Hierarchical Self-Collaboration and Self-Evaluation","summary":" In this study, we introduce the concept of OKR-Agent, designed to enhance the\ncapabilities of Large Language Models (LLMs) in task-solving. Our approach\nutilizes both self-collaboration and self-correction mechanisms, facilitated by\nhierarchical agents, to address the inherent complexities in task-solving. Our\nkey observations are two-fold: first, effective task-solving demands in-depth\ndomain knowledge and intricate reasoning, for which deploying specialized\nagents for individual sub-tasks can markedly enhance LLM performance. Second,\ntask-solving intrinsically adheres to a hierarchical execution structure,\ncomprising both high-level strategic planning and detailed task execution.\nTowards this end, our OKR-Agent paradigm aligns closely with this hierarchical\nstructure, promising enhanced efficacy and adaptability across a range of\nscenarios. Specifically, our framework includes two novel modules: hierarchical\nObjects and Key Results generation and multi-level evaluation, each\ncontributing to more efficient and robust task-solving. In practice,\nhierarchical OKR generation decomposes Objects into multiple sub-Objects and\nassigns new agents based on key results and agent responsibilities. These\nagents subsequently elaborate on their designated tasks and may further\ndecompose them as necessary. Such generation operates recursively and\nhierarchically, culminating in a comprehensive set of detailed solutions. The\nmulti-level evaluation module of OKR-Agent refines the solution by leveraging\nfeedback from all associated agents, optimizing each step of the process. This\nensures the solution is accurate and practical and effectively addresses intricate task\nrequirements, enhancing the overall reliability and quality of the outcome.\nExperimental results also show that our method outperforms previous methods on\nseveral tasks. 
Code and demo are available at https://okr-agent.github.io/\n","authors":["Yi Zheng","Chongyang Ma","Kanle Shi","Haibin Huang"],"pdf_url":"https://arxiv.org/pdf/2311.16542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13607v2","updated":"2023-11-28T06:16:03Z","published":"2023-09-24T11:04:50Z","title":"MM-NeRF: Multimodal-Guided 3D Multi-Style Transfer of Neural Radiance\n Field","summary":" 3D style transfer aims to generate stylized views of 3D scenes with specified\nstyles, which requires high-quality generating and keeping multi-view\nconsistency. Existing methods still suffer the challenges of high-quality\nstylization with texture details and stylization with multimodal guidance. In\nthis paper, we reveal that the common training method of stylization with NeRF,\nwhich generates stylized multi-view supervision by 2D style transfer models,\ncauses the same object in supervision to show various states (color tone,\ndetails, etc.) in different views, leading NeRF to tend to smooth the texture\ndetails, further resulting in low-quality rendering for 3D multi-style\ntransfer. To tackle these problems, we propose a novel Multimodal-guided 3D\nMulti-style transfer of NeRF, termed MM-NeRF. First, MM-NeRF projects\nmultimodal guidance into a unified space to keep the multimodal styles\nconsistency and extracts multimodal features to guide the 3D stylization.\nSecond, a novel multi-head learning scheme is proposed to relieve the\ndifficulty of learning multi-style transfer, and a multi-view style consistent\nloss is proposed to track the inconsistency of multi-view supervision data.\nFinally, a novel incremental learning mechanism to generalize MM-NeRF to any\nnew style with small costs. Extensive experiments on several real-world\ndatasets show that MM-NeRF achieves high-quality 3D multi-style stylization\nwith multimodal guidance, and keeps multi-view consistency and style\nconsistency between multimodal guidance. Codes will be released.\n","authors":["Zijiang Yang","Zhongwei Qiu","Chang Xu","Dongmei Fu"],"pdf_url":"https://arxiv.org/pdf/2309.13607v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.05228v2","updated":"2023-11-28T06:09:31Z","published":"2022-11-07T09:38:34Z","title":"FIXED: Frustratingly Easy Domain Generalization with Mixup","summary":" Domain generalization (DG) aims to learn a generalizable model from multiple\ntraining domains such that it can perform well on unseen target domains. A\npopular strategy is to augment training data to benefit generalization through\nmethods such as Mixup~\\cite{zhang2018mixup}. While the vanilla Mixup can be\ndirectly applied, theoretical and empirical investigations uncover several\nshortcomings that limit its performance. Firstly, Mixup cannot effectively\nidentify the domain and class information that can be used for learning\ninvariant representations. Secondly, Mixup may introduce synthetic noisy data\npoints via random interpolation, which lowers its discrimination capability.\nBased on the analysis, we propose a simple yet effective enhancement for\nMixup-based DG, namely domain-invariant Feature mIXup (FIX). It learns\ndomain-invariant representations for Mixup. To further enhance discrimination,\nwe leverage existing techniques to enlarge margins among classes to further\npropose the domain-invariant Feature MIXup with Enhanced Discrimination (FIXED)\napproach. We present theoretical insights about guarantees on its\neffectiveness. 
Extensive experiments on seven public datasets across two\nmodalities including image classification (Digits-DG, PACS, Office-Home) and\ntime series (DSADS, PAMAP2, UCI-HAR, and USC-HAD) demonstrate that our approach\nsignificantly outperforms nine state-of-the-art related methods, beating the\nbest performing baseline by 6.5\\% on average in terms of test accuracy. Code is\navailable at:\nhttps://github.com/jindongwang/transferlearning/tree/master/code/deep/fixed.\n","authors":["Wang Lu","Jindong Wang","Han Yu","Lei Huang","Xiang Zhang","Yiqiang Chen","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2211.05228v2.pdf","comment":"First Conference on Parsimony and Learning (CPAL) 2024; code for DG\n at: https://github.com/jindongwang/transferlearning/tree/master/code/DeepDG"},{"id":"http://arxiv.org/abs/2305.18766v3","updated":"2023-11-28T05:09:02Z","published":"2023-05-30T05:56:58Z","title":"HiFA: High-fidelity Text-to-3D Generation with Advanced Diffusion\n Guidance","summary":" The advancements in automatic text-to-3D generation have been remarkable.\nMost existing methods use pre-trained text-to-image diffusion models to\noptimize 3D representations like Neural Radiance Fields (NeRFs) via\nlatent-space denoising score matching. Yet, these methods often result in\nartifacts and inconsistencies across different views due to their suboptimal\noptimization approaches and limited understanding of 3D geometry. Moreover, the\ninherent constraints of NeRFs in rendering crisp geometry and stable textures\nusually lead to a two-stage optimization to attain high-resolution details.\nThis work proposes holistic sampling and smoothing approaches to achieve\nhigh-quality text-to-3D generation, all in a single-stage optimization. We\ncompute denoising scores in the text-to-image diffusion model's latent and\nimage spaces. Instead of randomly sampling timesteps (also referred to as noise\nlevels in denoising score matching), we introduce a novel timestep annealing\napproach that progressively reduces the sampled timestep throughout\noptimization. To generate high-quality renderings in a single-stage\noptimization, we propose regularization for the variance of z-coordinates along\nNeRF rays. To address texture flickering issues in NeRFs, we introduce a kernel\nsmoothing technique that refines importance sampling weights coarse-to-fine,\nensuring accurate and thorough sampling in high-density regions. Extensive\nexperiments demonstrate the superiority of our method over previous approaches,\nenabling the generation of highly detailed and view-consistent 3D assets\nthrough a single-stage training process.\n","authors":["Junzhe Zhu","Peiye Zhuang"],"pdf_url":"https://arxiv.org/pdf/2305.18766v3.pdf","comment":"Project page: https://hifa-team.github.io/HiFA-site/"},{"id":"http://arxiv.org/abs/2311.16524v1","updated":"2023-11-28T05:06:22Z","published":"2023-11-28T05:06:22Z","title":"3D Teeth Reconstruction from Panoramic Radiographs using Neural Implicit\n Functions","summary":" Panoramic radiography is a widely used imaging modality in dental practice\nand research. However, it only provides flattened 2D images, which limits the\ndetailed assessment of dental structures. In this paper, we propose Occudent, a\nframework for 3D teeth reconstruction from panoramic radiographs using neural\nimplicit functions, which, to the best of our knowledge, is the first work to\ndo so. 
For a given point in 3D space, the implicit function estimates whether\nthe point is occupied by a tooth, and thus implicitly determines the boundaries\nof 3D tooth shapes. Firstly, Occudent applies multi-label segmentation to the\ninput panoramic radiograph. Next, tooth shape embeddings as well as tooth class\nembeddings are generated from the segmentation outputs, which are fed to the\nreconstruction network. A novel module called Conditional eXcitation (CX) is\nproposed in order to effectively incorporate the combined shape and class\nembeddings into the implicit function. The performance of Occudent is evaluated\nusing both quantitative and qualitative measures. Importantly, Occudent is\ntrained and validated with actual panoramic radiographs as input, distinct from\nrecent works which used synthesized images. Experiments demonstrate the\nsuperiority of Occudent over state-of-the-art methods.\n","authors":["Sihwa Park","Seongjun Kim","In-Seok Song","Seung Jun Baek"],"pdf_url":"https://arxiv.org/pdf/2311.16524v1.pdf","comment":"12 pages, 2 figures, accepted to International Conference on Medical\n Image Computing and Computer-Assisted Intervention MICCAI 2023"},{"id":"http://arxiv.org/abs/2303.06842v4","updated":"2023-11-28T04:50:46Z","published":"2023-03-13T04:16:42Z","title":"Hierarchical Relationships: A New Perspective to Enhance Scene Graph\n Generation","summary":" This paper presents a finding that leveraging the hierarchical structures\namong labels for relationships and objects can substantially improve the\nperformance of scene graph generation systems. The focus of this work is to\ncreate an informative hierarchical structure that can divide object and\nrelationship categories into disjoint super-categories in a systematic way.\nSpecifically, we introduce a Bayesian prediction head to jointly predict the\nsuper-category of relationships between a pair of object instances, as well as\nthe detailed relationship within that super-category simultaneously,\nfacilitating more informative predictions. The resulting model exhibits the\ncapability to produce a more extensive set of predicates beyond the dataset\nannotations, and to tackle the prevalent issue of low annotation quality. While\nour paper presents preliminary findings, experiments on the Visual Genome\ndataset show its strong performance, particularly in predicate classifications\nand zero-shot settings, that demonstrates the promise of our approach.\n","authors":["Bowen Jiang","Camillo J. Taylor"],"pdf_url":"https://arxiv.org/pdf/2303.06842v4.pdf","comment":"NeurIPS 2023 New Frontiers in Graph Learning Workshop (NeurIPS\n GLFrontiers 2023); NeurIPS 2023 Queer in AI Workshop"},{"id":"http://arxiv.org/abs/2311.16488v1","updated":"2023-11-28T04:34:44Z","published":"2023-11-28T04:34:44Z","title":"Efficient Multimodal Diffusion Models Using Joint Data Infilling with\n Partially Shared U-Net","summary":" Recently, diffusion models have been used successfully to fit distributions\nfor cross-modal data translation and multimodal data generation. However, these\nmethods rely on extensive scaling, overlooking the inefficiency and\ninterference between modalities. We develop Partially Shared U-Net (PS-U-Net)\narchitecture which is an efficient multimodal diffusion model that allows text\nand image inputs to pass through dedicated layers and skip-connections for\npreserving modality-specific fine-grained details. 
Inspired by image\ninpainting, we also propose a new efficient multimodal sampling method that\nintroduces new scenarios for conditional generation while only requiring a\nsimple joint distribution to be learned. Our empirical exploration of the\nMS-COCO dataset demonstrates that our method generates multimodal text and\nimage data with higher quality compared to existing multimodal diffusion models\nwhile having a comparable size, faster training, faster multimodal sampling,\nand more flexible generation.\n","authors":["Zizhao Hu","Shaochong Jia","Mohammad Rostami"],"pdf_url":"https://arxiv.org/pdf/2311.16488v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15868v3","updated":"2023-11-28T04:28:48Z","published":"2023-06-28T01:50:46Z","title":"GraSS: Contrastive Learning with Gradient Guided Sampling Strategy for\n Remote Sensing Image Semantic Segmentation","summary":" Self-supervised contrastive learning (SSCL) has achieved significant\nmilestones in remote sensing image (RSI) understanding. Its essence lies in\ndesigning an unsupervised instance discrimination pretext task to extract image\nfeatures from a large number of unlabeled images that are beneficial for\ndownstream tasks. However, existing instance discrimination based SSCL suffer\nfrom two limitations when applied to the RSI semantic segmentation task: 1)\nPositive sample confounding issue; 2) Feature adaptation bias. It introduces a\nfeature adaptation bias when applied to semantic segmentation tasks that\nrequire pixel-level or object-level features. In this study, We observed that\nthe discrimination information can be mapped to specific regions in RSI through\nthe gradient of unsupervised contrastive loss, these specific regions tend to\ncontain singular ground objects. Based on this, we propose contrastive learning\nwith Gradient guided Sampling Strategy (GraSS) for RSI semantic segmentation.\nGraSS consists of two stages: Instance Discrimination warm-up (ID warm-up) and\nGradient guided Sampling contrastive training (GS training). The ID warm-up\naims to provide initial discrimination information to the contrastive loss\ngradients. The GS training stage aims to utilize the discrimination information\ncontained in the contrastive loss gradients and adaptively select regions in\nRSI patches that contain more singular ground objects, in order to construct\nnew positive and negative samples. Experimental results on three open datasets\ndemonstrate that GraSS effectively enhances the performance of SSCL in\nhigh-resolution RSI semantic segmentation. Compared to seven baseline methods\nfrom five different types of SSCL, GraSS achieves an average improvement of\n1.57\\% and a maximum improvement of 3.58\\% in terms of mean intersection over\nthe union. The source code is available at https://github.com/GeoX-Lab/GraSS\n","authors":["Zhaoyang Zhang","Zhen Ren","Chao Tao","Yunsheng Zhang","Chengli Peng","Haifeng Li"],"pdf_url":"https://arxiv.org/pdf/2306.15868v3.pdf","comment":"14 pages, 10 figures, 4 tables"},{"id":"http://arxiv.org/abs/2311.16471v1","updated":"2023-11-28T04:13:49Z","published":"2023-11-28T04:13:49Z","title":"A Unified Framework for Multimodal, Multi-Part Human Motion Synthesis","summary":" The field has made significant progress in synthesizing realistic human\nmotion driven by various modalities. Yet, the need for different methods to\nanimate various body parts according to different control signals limits the\nscalability of these techniques in practical scenarios. 
In this paper, we\nintroduce a cohesive and scalable approach that consolidates multimodal (text,\nmusic, speech) and multi-part (hand, torso) human motion generation. Our\nmethodology unfolds in several steps: We begin by quantizing the motions of\ndiverse body parts into separate codebooks tailored to their respective\ndomains. Next, we harness the robust capabilities of pre-trained models to\ntranscode multimodal signals into a shared latent space. We then translate\nthese signals into discrete motion tokens by iteratively predicting subsequent\ntokens to form a complete sequence. Finally, we reconstruct the continuous\nactual motion from this tokenized sequence. Our method frames the multimodal\nmotion generation challenge as a token prediction task, drawing from\nspecialized codebooks based on the modality of the control signal. This\napproach is inherently scalable, allowing for the easy integration of new\nmodalities. Extensive experiments demonstrated the effectiveness of our design,\nemphasizing its potential for broad application.\n","authors":["Zixiang Zhou","Yu Wan","Baoyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.16471v1.pdf","comment":"19 pages, 18 figures"},{"id":"http://arxiv.org/abs/2311.16468v1","updated":"2023-11-28T04:10:07Z","published":"2023-11-28T04:10:07Z","title":"AvatarGPT: All-in-One Framework for Motion Understanding, Planning,\n Generation and Beyond","summary":" Large Language Models (LLMs) have shown remarkable emergent abilities in\nunifying almost all (if not every) NLP tasks. In the human motion-related\nrealm, however, researchers still develop siloed models for each task. Inspired\nby InstructGPT and the generalist concept behind Gato, we introduce AvatarGPT,\nan All-in-One framework for motion understanding, planning, and generation, as well\nas other tasks such as motion in-between synthesis. AvatarGPT treats each task\nas one type of instruction fine-tuned on the shared LLM. All the tasks are\nseamlessly interconnected with language as the universal interface,\nconstituting a closed loop within the framework. To achieve this, human motion\nsequences are first encoded as discrete tokens, which serve as the extended\nvocabulary of the LLM. Then, an unsupervised pipeline to generate natural language\ndescriptions of human action sequences from in-the-wild videos is developed.\nFinally, all tasks are jointly trained. Extensive experiments show that\nAvatarGPT achieves SOTA on low-level tasks, and promising results on high-level\ntasks, demonstrating the effectiveness of our proposed All-in-One framework.\nMoreover, for the first time, AvatarGPT enables a principled approach by\niterative traversal of the tasks within the closed loop for unlimited\nlong-motion synthesis.\n","authors":["Zixiang Zhou","Yu Wan","Baoyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.16468v1.pdf","comment":"22 pages, 21 figures"},{"id":"http://arxiv.org/abs/2311.08223v2","updated":"2023-11-28T04:05:03Z","published":"2023-11-14T15:01:58Z","title":"Improving Image Captioning via Predicting Structured Concepts","summary":" Given the difficulty of bridging the semantic gap between images and texts\nin the image captioning task, conventional studies in this area have paid some\nattention to treating semantic concepts as a bridge between the two modalities\nand improved captioning performance accordingly. 
Although promising results on\nconcept prediction were obtained, the aforementioned studies normally ignore\nthe relationships among concepts, which rely not only on objects in the image\nbut also on word dependencies in the text, and which offer considerable potential\nfor improving the process of generating good descriptions. In this paper, we\npropose a structured concept predictor (SCP) to predict concepts and their\nstructures, and then integrate them into captioning, so as to enhance the\ncontribution of visual signals in this task via concepts and further use their\nrelations to distinguish cross-modal semantics for better description\ngeneration. In particular, we design weighted graph convolutional networks\n(W-GCN) to depict concept relations driven by word dependencies, and then\nlearn differentiated contributions from these concepts for the following decoding\nprocess. Therefore, our approach captures potential relations among concepts\nand discriminatively learns different concepts, effectively facilitating\nimage captioning with inherited information across modalities. Extensive\nexperiments demonstrate the effectiveness of our approach as\nwell as each proposed module in this work.\n","authors":["Ting Wang","Weidong Chen","Yuanhe Tian","Yan Song","Zhendong Mao"],"pdf_url":"https://arxiv.org/pdf/2311.08223v2.pdf","comment":"Accepted by EMNLP 2023 (Main Conference, Oral)"},{"id":"http://arxiv.org/abs/2311.16465v1","updated":"2023-11-28T04:02:40Z","published":"2023-11-28T04:02:40Z","title":"TextDiffuser-2: Unleashing the Power of Language Models for Text\n Rendering","summary":" The diffusion model has been proven a powerful generative model in recent\nyears, yet generating visual text remains a challenge. Several methods have\nalleviated this issue by incorporating explicit text position and content as\nguidance on where and what text to render. However, these methods still suffer\nfrom several drawbacks, such as limited flexibility and automation, constrained\ncapability of layout prediction, and restricted style diversity. In this paper,\nwe present TextDiffuser-2, aiming to unleash the power of language models for\ntext rendering. Firstly, we fine-tune a large language model for layout\nplanning. The large language model is capable of automatically generating\nkeywords for text rendering and also supports layout modification through\nchatting. Secondly, we utilize the language model within the diffusion model to\nencode the position and texts at the line level. Unlike previous methods that\nemployed tight character-level guidance, this approach generates more diverse\ntext images. We conduct extensive experiments and incorporate user studies\ninvolving human participants as well as GPT-4V, validating TextDiffuser-2's\ncapacity to achieve a more rational text layout and generation with enhanced\ndiversity. The code and model will be available at\n\\url{https://aka.ms/textdiffuser-2}.\n","authors":["Jingye Chen","Yupan Huang","Tengchao Lv","Lei Cui","Qifeng Chen","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2311.16465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.06296v2","updated":"2023-11-28T03:58:28Z","published":"2021-09-13T20:12:42Z","title":"Monocular Camera Localization for Automated Vehicles Using Image\n Retrieval","summary":" We address the problem of finding the current position and heading angle of\nan autonomous vehicle in real-time using a single camera. 
Compared to methods\nwhich require LiDARs and high definition (HD) 3D maps in real-time, the\nproposed approach is easily scalable and computationally efficient, at the\nprice of lower precision.\n The new method combines and adapts existing algorithms in three different\nfields: image retrieval, mapping database, and particle filtering. The result\nis a simple, real-time localization method using an image retrieval method\nwhose performance is comparable to other monocular camera localization methods\nwhich use a map built with LiDARs.\n We evaluate the proposed method using the KITTI odometry dataset and via\nclosed-loop experiments with an indoor 1:10 autonomous vehicle. The tests\ndemonstrate real-time capability and a 10cm level accuracy. Also, experimental\nresults of the closed-loop indoor tests show the presence of a positive\nfeedback loop between the localization error and the control error. Such\nphenomena is analysed in details at the end of the article.\n","authors":["Eunhyek Joa","Francesco Borrelli"],"pdf_url":"https://arxiv.org/pdf/2109.06296v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16464v1","updated":"2023-11-28T03:55:23Z","published":"2023-11-28T03:55:23Z","title":"Bridging the Gap: A Unified Video Comprehension Framework for Moment\n Retrieval and Highlight Detection","summary":" Video Moment Retrieval (MR) and Highlight Detection (HD) have attracted\nsignificant attention due to the growing demand for video analysis. Recent\napproaches treat MR and HD as similar video grounding problems and address them\ntogether with transformer-based architecture. However, we observe that the\nemphasis of MR and HD differs, with one necessitating the perception of local\nrelationships and the other prioritizing the understanding of global contexts.\nConsequently, the lack of task-specific design will inevitably lead to\nlimitations in associating the intrinsic specialty of two tasks. To tackle the\nissue, we propose a Unified Video COMprehension framework (UVCOM) to bridge the\ngap and jointly solve MR and HD effectively. By performing progressive\nintegration on intra and inter-modality across multi-granularity, UVCOM\nachieves the comprehensive understanding in processing a video. Moreover, we\npresent multi-aspect contrastive learning to consolidate the local relation\nmodeling and global knowledge accumulation via well aligned multi-modal space.\nExtensive experiments on QVHighlights, Charades-STA, TACoS , YouTube Highlights\nand TVSum datasets demonstrate the effectiveness and rationality of UVCOM which\noutperforms the state-of-the-art methods by a remarkable margin.\n","authors":["Yicheng Xiao","Zhuoyan Luo","Yong Liu","Yue Ma","Hengwei Bian","Yatai Ji","Yujiu Yang","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2311.16464v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03211v2","updated":"2023-11-28T03:50:54Z","published":"2023-10-04T23:33:36Z","title":"On the Performance of Multimodal Language Models","summary":" Instruction-tuned large language models (LLMs) have demonstrated promising\nzero-shot generalization capabilities across various downstream tasks. Recent\nresearch has introduced multimodal capabilities to LLMs by integrating\nindependently pretrained vision encoders through model grafting. These\nmultimodal variants undergo instruction tuning, similar to LLMs, enabling\neffective zero-shot generalization for multimodal tasks. 
This study conducts a\ncomparative analysis of different multimodal instruction tuning approaches and\nevaluates their performance across a range of tasks, including complex\nreasoning, conversation, image captioning, multiple-choice questions (MCQs),\nand binary classification. Through rigorous benchmarking and ablation\nexperiments, we reveal key insights for guiding architectural choices when\nincorporating multimodal capabilities into LLMs. However, current approaches\nhave limitations; they do not sufficiently address the need for a diverse\nmultimodal instruction dataset, which is crucial for enhancing task\ngeneralization. Additionally, they overlook issues related to truthfulness and\nfactuality when generating responses. These findings illuminate current\nmethodological constraints in adapting language models for image comprehension\nand provide valuable guidance for researchers and practitioners seeking to\nharness multimodal versions of LLMs.\n","authors":["Utsav Garg","Erhan Bas"],"pdf_url":"https://arxiv.org/pdf/2310.03211v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16462v1","updated":"2023-11-28T03:45:29Z","published":"2023-11-28T03:45:29Z","title":"Viewport Prediction for Volumetric Video Streaming by Exploring Video\n Saliency and Trajectory Information","summary":" Volumetric video, also known as hologram video, is a novel medium that\nportrays natural content in Virtual Reality (VR), Augmented Reality (AR), and\nMixed Reality (MR). It is expected to be the next-gen video technology and a\nprevalent use case for 5G and beyond wireless communication. Considering that\neach user typically only watches a section of the volumetric video, known as\nthe viewport, it is essential to have precise viewport prediction for optimal\nperformance. However, research on this topic is still in its infancy. In the\nend, this paper presents and proposes a novel approach, named Saliency and\nTrajectory Viewport Prediction (STVP), which aims to improve the precision of\nviewport prediction in volumetric video streaming. The STVP extensively\nutilizes video saliency information and viewport trajectory. To our knowledge,\nthis is the first comprehensive study of viewport prediction in volumetric\nvideo streaming. In particular, we introduce a novel sampling method, Uniform\nRandom Sampling (URS), to reduce computational complexity while still\npreserving video features in an efficient manner. Then we present a saliency\ndetection technique that incorporates both spatial and temporal information for\ndetecting static, dynamic geometric, and color salient regions. Finally, we\nintelligently fuse saliency and trajectory information to achieve more accurate\nviewport prediction. We conduct extensive simulations to evaluate the\neffectiveness of our proposed viewport prediction methods using\nstate-of-the-art volumetric video sequences. The experimental results show the\nsuperiority of the proposed method over existing schemes. The dataset and\nsource code will be publicly accessible after acceptance.\n","authors":["Jie Li","Zhixin Li","Zhi Liu","Pengyuan Zhou","Richang Hong","Qiyue Li","Han Hu"],"pdf_url":"https://arxiv.org/pdf/2311.16462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12532v3","updated":"2023-11-28T03:42:45Z","published":"2023-08-24T03:43:02Z","title":"FedSOL: Stabilized Orthogonal Learning in Federated Learning","summary":" Federated Learning (FL) aggregates locally trained models from individual\nclients to construct a global model. 
While FL enables learning a model with\ndata privacy, it often suffers from significant performance degradation when\nclient data distributions are heterogeneous. Many previous FL algorithms have\naddressed this issue by introducing various proximal restrictions. These\nrestrictions aim to encourage global alignment by constraining the deviation of\nlocal learning from the global objective. However, they inherently limit local\nlearning by interfering with the original local objectives. Recently, an\nalternative approach has emerged to improve local learning generality. By\nobtaining local models within a smooth loss landscape, this approach mitigates\nconflicts among different local objectives of the clients. Yet, it does not\nensure stable global alignment, as local learning does not take the global\nobjective into account. In this study, we propose Federated Stability on\nLearning (FedSoL), which combines both the concepts of global alignment and\nlocal generality. In FedSoL, the local learning seeks a parameter region robust\nagainst proximal perturbations. This strategy introduces an implicit proximal\nrestriction effect in local learning while maintaining the original local\nobjective for parameter update. Our experiments show that FedSoL consistently\nachieves state-of-the-art performance on various setups.\n","authors":["Gihun Lee","Minchan Jeong","Sangmook Kim","Jaehoon Oh","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2308.12532v3.pdf","comment":"19 pages, 12 figures"},{"id":"http://arxiv.org/abs/2311.16456v1","updated":"2023-11-28T03:30:43Z","published":"2023-11-28T03:30:43Z","title":"Spiking Neural Networks with Dynamic Time Steps for Vision Transformers","summary":" Spiking Neural Networks (SNNs) have emerged as a popular spatio-temporal\ncomputing paradigm for complex vision tasks. Recently proposed SNN training\nalgorithms have significantly reduced the number of time steps (down to 1) for\nimproved latency and energy efficiency, however, they target only convolutional\nneural networks (CNN). These algorithms, when applied on the recently\nspotlighted vision transformers (ViT), either require a large number of time\nsteps or fail to converge. Based on analysis of the histograms of the ANN and\nSNN activation maps, we hypothesize that each ViT block has a different\nsensitivity to the number of time steps. We propose a novel training framework\nthat dynamically allocates the number of time steps to each ViT module\ndepending on a trainable score assigned to each timestep. In particular, we\ngenerate a scalar binary time step mask that filters spikes emitted by each\nneuron in a leaky-integrate-and-fire (LIF) layer. The resulting SNNs have high\nactivation sparsity and require only accumulate operations (AC), except for the\ninput embedding layer, in contrast to expensive multiply-and-accumulates (MAC)\nneeded in traditional ViTs. This yields significant improvements in energy\nefficiency. We evaluate our training framework and resulting SNNs on image\nrecognition tasks including CIFAR10, CIFAR100, and ImageNet with different ViT\narchitectures. We obtain a test accuracy of 95.97% with 4.97 time steps with\ndirect encoding on CIFAR10.\n","authors":["Gourav Datta","Zeyu Liu","Anni Li","Peter A. 
Beerel"],"pdf_url":"https://arxiv.org/pdf/2311.16456v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2311.15830v2","updated":"2023-11-28T03:15:50Z","published":"2023-11-27T13:53:53Z","title":"A-JEPA: Joint-Embedding Predictive Architecture Can Listen","summary":" This paper presents that the masked-modeling principle driving the success of\nlarge foundational vision models can be effectively applied to audio by making\npredictions in a latent space. We introduce Audio-based Joint-Embedding\nPredictive Architecture (A-JEPA), a simple extension method for self-supervised\nlearning from the audio spectrum. Following the design of I-JEPA, our A-JEPA\nencodes visible audio spectrogram patches with a curriculum masking strategy\nvia context encoder, and predicts the representations of regions sampled at\nwell-designed locations. The target representations of those regions are\nextracted by the exponential moving average of context encoder, \\emph{i.e.},\ntarget encoder, on the whole spectrogram. We find it beneficial to transfer\nrandom block masking into time-frequency aware masking in a curriculum manner,\nconsidering the complexity of highly correlated in local time and frequency in\naudio spectrograms. To enhance contextual semantic understanding and\nrobustness, we fine-tune the encoder with a regularized masking on target\ndatasets, instead of input dropping or zero. Empirically, when built with\nVision Transformers structure, we find A-JEPA to be highly scalable and sets\nnew state-of-the-art performance on multiple audio and speech classification\ntasks, outperforming other recent models that use externally supervised\npre-training.\n","authors":["Zhengcong Fei","Mingyuan Fan","Junshi Huang"],"pdf_url":"https://arxiv.org/pdf/2311.15830v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16450v1","updated":"2023-11-28T03:11:33Z","published":"2023-11-28T03:11:33Z","title":"Typhoon Intensity Prediction with Vision Transformer","summary":" Predicting typhoon intensity accurately across space and time is crucial for\nissuing timely disaster warnings and facilitating emergency response. This has\nvast potential for minimizing life losses and property damages as well as\nreducing economic and environmental impacts. Leveraging satellite imagery for\nscenario analysis is effective but also introduces additional challenges due to\nthe complex relations among clouds and the highly dynamic context. Existing\ndeep learning methods in this domain rely on convolutional neural networks\n(CNNs), which suffer from limited per-layer receptive fields. This limitation\nhinders their ability to capture long-range dependencies and global contextual\nknowledge during inference. In response, we introduce a novel approach, namely\n\"Typhoon Intensity Transformer\" (Tint), which leverages self-attention\nmechanisms with global receptive fields per layer. Tint adopts a\nsequence-to-sequence feature representation learning perspective. It begins by\ncutting a given satellite image into a sequence of patches and recursively\nemploys self-attention operations to extract both local and global contextual\nrelations between all patch pairs simultaneously, thereby enhancing per-patch\nfeature representation learning. Extensive experiments on a publicly available\ntyphoon benchmark validate the efficacy of Tint in comparison with both\nstate-of-the-art deep learning and conventional meteorological methods. 
Our\ncode is available at https://github.com/chen-huanxin/Tint.\n","authors":["Huanxin Chen","Pengshuai Yin","Huichou Huang","Qingyao Wu","Ruirui Liu","Xiatian Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.16450v1.pdf","comment":"8 pages, 2 figures, accepted by Tackling Climate Change with Machine\n Learning: workshop at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.16447v1","updated":"2023-11-28T03:04:35Z","published":"2023-11-28T03:04:35Z","title":"TopoSemiSeg: Enforcing Topological Consistency for Semi-Supervised\n Segmentation of Histopathology Images","summary":" In computational pathology, segmenting densely distributed objects like\nglands and nuclei is crucial for downstream analysis. To alleviate the burden\nof obtaining pixel-wise annotations, semi-supervised learning methods learn\nfrom large amounts of unlabeled data. Nevertheless, existing semi-supervised\nmethods overlook the topological information hidden in the unlabeled images and\nare thus prone to topological errors, e.g., missing or incorrectly\nmerged/separated glands or nuclei. To address this issue, we propose\nTopoSemiSeg, the first semi-supervised method that learns the topological\nrepresentation from unlabeled data. In particular, we propose a topology-aware\nteacher-student approach in which the teacher and student networks learn shared\ntopological representations. To achieve this, we introduce topological\nconsistency loss, which contains signal consistency and noise removal losses to\nensure the learned representation is robust and focuses on true topological\nsignals. Extensive experiments on public pathology image datasets show the\nsuperiority of our method, especially on topology-wise evaluation metrics. Code\nis available at https://github.com/Melon-Xu/TopoSemiSeg.\n","authors":["Meilong Xu","Xiaoling Hu","Saumya Gupta","Shahira Abousamra","Chao Chen"],"pdf_url":"https://arxiv.org/pdf/2311.16447v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.16446v1","updated":"2023-11-28T03:02:00Z","published":"2023-11-28T03:02:00Z","title":"Centre Stage: Centricity-based Audio-Visual Temporal Action Detection","summary":" Previous one-stage action detection approaches have modelled temporal\ndependencies using only the visual modality. In this paper, we explore\ndifferent strategies to incorporate the audio modality, using multi-scale\ncross-attention to fuse the two modalities. We also demonstrate the correlation\nbetween the distance from the timestep to the action centre and the accuracy of\nthe predicted boundaries. Thus, we propose a novel network head to estimate the\ncloseness of timesteps to the action centre, which we call the centricity\nscore. This leads to increased confidence for proposals that exhibit more\nprecise boundaries. Our method can be integrated with other one-stage\nanchor-free architectures and we demonstrate this on three recent baselines on\nthe EPIC-Kitchens-100 action detection benchmark where we achieve\nstate-of-the-art performance. Detailed ablation studies showcase the benefits\nof fusing audio and our proposed centricity scores. 
Code and models for our\nproposed method are publicly available at\nhttps://github.com/hanielwang/Audio-Visual-TAD.git\n","authors":["Hanyuan Wang","Majid Mirmehdi","Dima Damen","Toby Perrett"],"pdf_url":"https://arxiv.org/pdf/2311.16446v1.pdf","comment":"Accepted to VUA workshop at BMVC 2023"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2311.16892v1","updated":"2023-11-28T15:43:46Z","published":"2023-11-28T15:43:46Z","title":"Enhancing Item-level Bundle Representation for Bundle Recommendation","summary":" Bundle recommendation approaches offer users a set of related items on a\nparticular topic. The current state-of-the-art (SOTA) method utilizes\ncontrastive learning to learn representations at both the bundle and item\nlevels. However, due to the inherent difference between the bundle-level and\nitem-level preferences, the item-level representations may not receive\nsufficient information from the bundle affiliations to make accurate\npredictions. In this paper, we propose a novel approach EBRec, short of\nEnhanced Bundle Recommendation, which incorporates two enhanced modules to\nexplore inherent item-level bundle representations. First, we propose to\nincorporate the bundle-user-item (B-U-I) high-order correlations to explore\nmore collaborative information, thus to enhance the previous bundle\nrepresentation that solely relies on the bundle-item affiliation information.\nSecond, we further enhance the B-U-I correlations by augmenting the observed\nuser-item interactions with interactions generated from pre-trained models,\nthus improving the item-level bundle representations. We conduct extensive\nexperiments on three public datasets, and the results justify the effectiveness\nof our approach as well as the two core modules. Codes and datasets are\navailable at https://github.com/answermycode/EBRec.\n","authors":["Xiaoyu Du","Kun Qian","Yunshan Ma","Xinguang Xiang"],"pdf_url":"https://arxiv.org/pdf/2311.16892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16878v1","updated":"2023-11-28T15:28:12Z","published":"2023-11-28T15:28:12Z","title":"Temporal Importance Factor for Loss Functions for CTR Prediction","summary":" Click-through rate (CTR) prediction is an important task for the companies to\nrecommend products which better match user preferences. User behavior in\ndigital advertising is dynamic and changes over time. It is crucial for the\ncompanies to capture the most recent trends to provide more accurate\nrecommendations for users. In CTR prediction, most models use binary\ncross-entropy loss function. However, it does not focus on the data\ndistribution shifts occurring over time. To address this problem, we propose a\nfactor for the loss functions by utilizing the sequential nature of user-item\ninteractions. This approach aims to focus on the most recent samples by\npenalizing them more through the loss function without forgetting the long-term\ninformation. Our solution is model-agnostic, and the temporal importance factor\ncan be used with different loss functions. 
Offline experiments in both public\nand company datasets show that the temporal importance factor for loss\nfunctions outperforms the baseline loss functions considered.\n","authors":["Ramazan Tarık Türksoy","Beyza Türkmen","Furkan Durmuş"],"pdf_url":"https://arxiv.org/pdf/2311.16878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11270v3","updated":"2023-11-28T13:57:46Z","published":"2023-10-17T13:42:32Z","title":"Graph Neural Networks for Recommendation: Reproducibility, Graph\n Topology, and Node Representation","summary":" Graph neural networks (GNNs) have gained prominence in recommendation systems\nin recent years. By representing the user-item matrix as a bipartite and\nundirected graph, GNNs have demonstrated their potential to capture short- and\nlong-distance user-item interactions, thereby learning more accurate preference\npatterns than traditional recommendation approaches. In contrast to previous\ntutorials on the same topic, this tutorial aims to present and examine three\nkey aspects that characterize GNNs for recommendation: (i) the reproducibility\nof state-of-the-art approaches, (ii) the potential impact of graph topological\ncharacteristics on the performance of these models, and (iii) strategies for\nlearning node representations when training features from scratch or utilizing\npre-trained embeddings as additional item information (e.g., multimodal\nfeatures). The goal is to provide three novel theoretical and practical\nperspectives on the field, currently subject to debate in graph learning but\nlong been overlooked in the context of recommendation systems.\n","authors":["Daniele Malitesta","Claudio Pomo","Tommaso Di Noia"],"pdf_url":"https://arxiv.org/pdf/2310.11270v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09049v2","updated":"2023-11-28T13:11:41Z","published":"2023-11-15T15:39:33Z","title":"Adapting Large Language Models by Integrating Collaborative Semantics\n for Recommendation","summary":" Recently, large language models (LLMs) have shown great potential in\nrecommender systems, either improving existing recommendation models or serving\nas the backbone. However, there exists a large semantic gap between LLMs and\nrecommender systems, since items to be recommended are often indexed by\ndiscrete identifiers (item ID) out of the LLM's vocabulary. In essence, LLMs\ncapture language semantics while recommender systems imply collaborative\nsemantics, making it difficult to sufficiently leverage the model capacity of\nLLMs for recommendation. To address this challenge, in this paper, we propose a\nnew LLM-based recommendation model called LC-Rec, which can better integrate\nlanguage and collaborative semantics for recommender systems. Our approach can\ndirectly generate items from the entire item set for recommendation, without\nrelying on candidate items. Specifically, we make two major contributions in\nour approach. For item indexing, we design a learning-based vector quantization\nmethod with uniform semantic mapping, which can assign meaningful and\nnon-conflicting IDs (called item indices) for items. For alignment tuning, we\npropose a series of specially designed tuning tasks to enhance the integration\nof collaborative semantics in LLMs. Our fine-tuning tasks enforce LLMs to\ndeeply integrate language and collaborative semantics (characterized by the\nlearned item indices), so as to achieve an effective adaptation to recommender\nsystems. 
Extensive experiments demonstrate the effectiveness of our method,\nshowing that our approach can outperform a number of competitive baselines\nincluding traditional recommenders and existing LLM-based recommenders. Our\ncode is available at https://github.com/RUCAIBox/LC-Rec/.\n","authors":["Bowen Zheng","Yupeng Hou","Hongyu Lu","Yu Chen","Wayne Xin Zhao","Ming Chen","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2311.09049v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06985v4","updated":"2023-11-28T12:59:20Z","published":"2023-07-13T17:25:28Z","title":"Patent Documents to Engineering Design Knowledge Graphs","summary":" Aimed at supporting knowledge-intensive tasks in the design process,\npopulating design knowledge from text documents involves the extraction of\ntriples - head entity :: relationship :: tail entity or h :: r :: t that could\nbe combined into a knowledge graph representation. As relationships are largely\nchosen from ontological or common-sense alternatives, knowledge graphs built\nusing these depict an approximation or restricted view of design knowledge,\nrather than what is explicated in text document. In this article, we present a\ndata-driven approach to identify and explicate facts (h :: r :: t) from\nsentences in patent documents. We create a dataset of 44,227 sentences and\nfacts, encompassing all patent classifications while also capturing the\nvariations among patent document sections. Using this dataset, we train taggers\nthat classify tokens to: 1) identify all entities (h) and relationships (r) and\n2) specific relationships (r) for a pair of entities (h :: ___ :: t). While\nthese taggers are built upon transformer-based sequence classification models,\nwe evaluate our proposed method against edge classification approaches that use\nlinear classifiers and graph neural networks, incorporating transformer-based\ntoken embeddings and linguistic features. The simplicity and coverage of the\nproposed method enable its application to patent documents at any scale and\nvariety. Upon deploying an open-source python package, we apply our method to\npatent documents related to fan systems. From the knowledge graphs thus\nextracted, we explain how facts could be generalised to domain ontologies as\nwell as be specified to subsystem levels. We also highlight the importance of\nknowledge graph representations by retrieving and explicating the knowledge of\nkey issues in fan systems, while holding a comparative discussion against\nopinions from ChatGPT.\n","authors":["L Siddharth","Jianxi Luo"],"pdf_url":"https://arxiv.org/pdf/2307.06985v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16751v1","updated":"2023-11-28T12:50:40Z","published":"2023-11-28T12:50:40Z","title":"MultiCBR: Multi-view Contrastive Learning for Bundle Recommendation","summary":" Bundle recommendation seeks to recommend a bundle of related items to users\nto improve both user experience and the profits of platform. Existing bundle\nrecommendation models have progressed from capturing only user-bundle\ninteractions to the modeling of multiple relations among users, bundles and\nitems. CrossCBR, in particular, incorporates cross-view contrastive learning\ninto a two-view preference learning framework, significantly improving SOTA\nperformance. 
It does, however, have two limitations: 1) the two-view\nformulation does not fully exploit all the heterogeneous relations among users,\nbundles and items; and 2) the \"early contrast and late fusion\" framework is\nless effective in capturing user preference and difficult to generalize to\nmultiple views. In this paper, we present MultiCBR, a novel Multi-view\nContrastive learning framework for Bundle Recommendation. First, we devise a\nmulti-view representation learning framework capable of capturing all the\nuser-bundle, user-item and bundle-item relations, especially better utilizing\nthe bundle-item affiliations to enhance sparse bundles' representations.\nSecond, we innovatively adopt an \"early fusion and late contrast\" design that\nfirst fuses the multi-view representations before performing self-supervised\ncontrastive learning. In comparison to existing approaches, our framework\nreverses the order of fusion and contrast, introducing the following\nadvantages: 1)our framework is capable of modeling both cross-view and ego-view\npreferences, allowing us to achieve enhanced user preference modeling; and 2)\ninstead of requiring quadratic number of cross-view contrastive losses, we only\nrequire two self-supervised contrastive losses, resulting in minimal extra\ncosts. Experimental results on three public datasets indicate that our method\noutperforms SOTA methods.\n","authors":["Yunshan Ma","Yingzhi He","Xiang Wang","Yinwei Wei","Xiaoyu Du","Yuyangzi Fu","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2311.16751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16720v1","updated":"2023-11-28T12:04:19Z","published":"2023-11-28T12:04:19Z","title":"RankingGPT: Empowering Large Language Models in Text Ranking with\n Progressive Enhancement","summary":" Text ranking is a critical task in various information retrieval\napplications, and the recent success of Large Language Models (LLMs) in natural\nlanguage processing has sparked interest in their application to text ranking.\nThese methods primarily involve combining query and candidate documents and\nleveraging prompt learning to determine query-document relevance using the\nLLM's output probabilities for specific tokens or by directly generating a\nranked list of candidate documents. Although these approaches have demonstrated\npromise, a noteworthy disparity arises between the training objective of LLMs,\nwhich typically centers around next token prediction, and the objective of\nevaluating query-document relevance. To address this gap and fully leverage LLM\npotential in text ranking tasks, we propose a progressive multi-stage training\nstrategy. Firstly, we introduce a large-scale weakly supervised dataset of\nrelevance texts to enable the LLMs to acquire the ability to predict relevant\ntokens without altering their original training objective. Subsequently, we\nincorporate supervised training to further enhance LLM ranking capability. 
Our\nexperimental results on multiple benchmarks demonstrate the superior\nperformance of our proposed method compared to previous competitive approaches,\nboth in in-domain and out-of-domain scenarios.\n","authors":["Longhui Zhang","Yanzhao Zhang","Dingkun Long","Pengjun Xie","Meishan Zhang","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.16720v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2311.16716v1","updated":"2023-11-28T12:00:06Z","published":"2023-11-28T12:00:06Z","title":"Graph Pre-training and Prompt Learning for Recommendation","summary":" GNN-based recommenders have excelled in modeling intricate user-item\ninteractions through multi-hop message passing. However, existing methods often\noverlook the dynamic nature of evolving user-item interactions, which impedes\nthe adaption to changing user preferences and distribution shifts in newly\narriving data. Thus, their scalability and performances in real-world dynamic\nenvironments are limited. In this study, we propose GraphPL, a framework that\nincorporates parameter-efficient and dynamic graph pre-training with prompt\nlearning. This novel combination empowers GNNs to effectively capture both\nlong-term user preferences and short-term behavior dynamics, enabling the\ndelivery of accurate and timely recommendations. Our GraphPL framework\naddresses the challenge of evolving user preferences by seamlessly integrating\na temporal prompt mechanism and a graph-structural prompt learning mechanism\ninto the pre-trained GNN model. The temporal prompt mechanism encodes time\ninformation on user-item interaction, allowing the model to naturally capture\ntemporal context, while the graph-structural prompt learning mechanism enables\nthe transfer of pre-trained knowledge to adapt to behavior dynamics without the\nneed for continuous incremental training. We further bring in a dynamic\nevaluation setting for recommendation to mimic real-world dynamic scenarios and\nbridge the offline-online gap to a better level. Our extensive experiments\nincluding a large-scale industrial deployment showcases the lightweight plug-in\nscalability of our GraphPL when integrated with various state-of-the-art\nrecommenders, emphasizing the advantages of GraphPL in terms of effectiveness,\nrobustness and efficiency.\n","authors":["Yuhao Yang","Lianghao Xia","Da Luo","Kangyi Lin","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2311.16716v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14968v2","updated":"2023-11-28T11:10:27Z","published":"2023-11-25T08:59:45Z","title":"Hide Your Model: A Parameter Transmission-free Federated Recommender\n System","summary":" With the growing concerns regarding user data privacy, Federated Recommender\nSystem (FedRec) has garnered significant attention recently due to its\nprivacy-preserving capabilities. Existing FedRecs generally adhere to a\nlearning protocol in which a central server shares a global recommendation\nmodel with clients, and participants achieve collaborative learning by\nfrequently communicating the model's public parameters. Nevertheless, this\nlearning framework has two drawbacks that limit its practical usability: (1) It\nnecessitates a global-sharing recommendation model; however, in real-world\nscenarios, information related to the recommender model, including its\nalgorithm and parameters, constitutes the platforms' intellectual property.\nHence, service providers are unlikely to release such information actively. 
(2)\nThe communication costs of model parameter transmission are expensive since the\nmodel parameters are usually high-dimensional matrices. With the model size\nincreasing, the communication burden will be the bottleneck for such\ntraditional FedRecs.\n Given the above limitations, this paper introduces a novel parameter\ntransmission-free federated recommendation framework that balances the\nprotection between users' data privacy and platforms' model privacy, namely\nPTF-FedRec. Specifically, participants in PTF-FedRec collaboratively exchange\nknowledge by sharing their predictions within a privacy-preserving mechanism.\nThrough this way, the central server can learn a recommender model without\ndisclosing its model parameters or accessing clients' raw data, preserving both\nthe server's model privacy and users' data privacy. Besides, since clients and\nthe central server only need to communicate prediction scores which are just a\nfew real numbers, the overhead is significantly reduced compared to traditional\nFedRecs.\n","authors":["Wei Yuan","Chaoqun Yang","Liang Qu","Quoc Viet Hung Nguyen","Jianxin Li","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2311.14968v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16683v1","updated":"2023-11-28T10:55:00Z","published":"2023-11-28T10:55:00Z","title":"Hyper-Relational Knowledge Graph Neural Network for Next POI","summary":" With the advancement of mobile technology, Point of Interest (POI)\nrecommendation systems in Location-based Social Networks (LBSN) have brought\nnumerous benefits to both users and companies. Many existing works employ\nKnowledge Graph (KG) to alleviate the data sparsity issue in LBSN. These\napproaches primarily focus on modeling the pair-wise relations in LBSN to\nenrich the semantics and thereby relieve the data sparsity issue. However,\nexisting approaches seldom consider the hyper-relations in LBSN, such as the\nmobility relation (a 3-ary relation: user-POI-time). This makes the model hard\nto exploit the semantics accurately. In addition, prior works overlook the rich\nstructural information inherent in KG, which consists of higher-order relations\nand can further alleviate the impact of data sparsity.To this end, we propose a\nHyper-Relational Knowledge Graph Neural Network (HKGNN) model. In HKGNN, a\nHyper-Relational Knowledge Graph (HKG) that models the LBSN data is constructed\nto maintain and exploit the rich semantics of hyper-relations. Then we proposed\na Hypergraph Neural Network to utilize the structural information of HKG in a\ncohesive way. In addition, a self-attention network is used to leverage\nsequential information and make personalized recommendations. Furthermore, side\ninformation, essential in reducing data sparsity by providing background\nknowledge of POIs, is not fully utilized in current methods. In light of this,\nwe extended the current dataset with available side information to further\nlessen the impact of data sparsity. 
Results of experiments on four real-world\nLBSN datasets demonstrate the effectiveness of our approach compared to\nexisting state-of-the-art methods.\n","authors":["Jixiao Zhang","Yongkang Li","Ruotong Zou","Jingyuan Zhang","Zipei Fan","Xuan Song"],"pdf_url":"https://arxiv.org/pdf/2311.16683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16603v1","updated":"2023-11-28T08:44:01Z","published":"2023-11-28T08:44:01Z","title":"l2Match: Optimization Techniques on Subgraph Matching Algorithm using\n Label Pair, Neighboring Label Index, and Jump-Redo method","summary":" Graph database is designed to store bidirectional relationships between\nobjects and facilitate the traversal process to extract a subgraph. However,\nthe subgraph matching process is an NP-Complete problem. Existing solutions to\nthis problem usually employ a filter-and-verification framework and a\ndivide-and-conquer method. The filter-and-verification framework minimizes the\nnumber of inputs to the verification stage by filtering and pruning invalid\ncandidates as much as possible. Meanwhile, subgraph matching is performed on\nthe substructure decomposed from the larger graph to yield partial embedding.\nSubsequently, the recursive traversal or set intersection technique combines\nthe partial embedding into a complete subgraph. In this paper, we first present\na comprehensive literature review of the state-of-the-art solutions. l2Match, a\nsubgraph isomorphism algorithm for small queries utilizing a Label-Pair Index\nand filtering method, is then proposed and presented as a proof of concept.\nEmpirical experimentation shows that l2Match outperforms related\nstate-of-the-art solutions, and the proposed methods optimize the existing\nalgorithms.\n","authors":["C. Q. Cheng","K. S. Wong","L. K. Soon"],"pdf_url":"https://arxiv.org/pdf/2311.16603v1.pdf","comment":"This short version of this article (6 pages) is accepted by ICEIC\n 2024"},{"id":"http://arxiv.org/abs/2311.16586v1","updated":"2023-11-28T08:03:56Z","published":"2023-11-28T08:03:56Z","title":"SARDINE: A Simulator for Automated Recommendation in Dynamic and\n Interactive Environments","summary":" Simulators can provide valuable insights for researchers and practitioners\nwho wish to improve recommender systems, because they allow one to easily tweak\nthe experimental setup in which recommender systems operate, and as a result\nlower the cost of identifying general trends and uncovering novel findings\nabout the candidate methods. A key requirement to enable this accelerated\nimprovement cycle is that the simulator is able to span the various sources of\ncomplexity that can be found in the real recommendation environment that it\nsimulates.\n With the emergence of interactive and data-driven methods - e.g.,\nreinforcement learning or online and counterfactual learning-to-rank - that aim\nto achieve user-related goals beyond the traditional accuracy-centric\nobjectives, adequate simulators are needed. In particular, such simulators must\nmodel the various mechanisms that render the recommendation environment dynamic\nand interactive, e.g., the effect of recommendations on the user or the effect\nof biased data on subsequent iterations of the recommender system. We therefore\npropose SARDINE, a flexible and interpretable recommendation simulator that can\nhelp accelerate research in interactive and data-driven recommender systems. 
We\ndemonstrate its usefulness by studying existing methods within nine diverse\nenvironments derived from SARDINE, and even uncover novel insights about them.\n","authors":["Romain Deffayet","Thibaut Thonet","Dongyoon Hwang","Vassilissa Lehoux","Jean-Michel Renders","Maarten de Rijke"],"pdf_url":"https://arxiv.org/pdf/2311.16586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16441v1","updated":"2023-11-28T02:43:02Z","published":"2023-11-28T02:43:02Z","title":"ControlRec: Bridging the Semantic Gap between Language Model and\n Personalized Recommendation","summary":" The successful integration of large language models (LLMs) into\nrecommendation systems has proven to be a major breakthrough in recent studies,\npaving the way for more generic and transferable recommendations. However, LLMs\nstruggle to effectively utilize user and item IDs, which are crucial\nidentifiers for successful recommendations. This is mainly due to their\ndistinct representation in a semantic space that is different from the natural\nlanguage (NL) typically used to train LLMs. To tackle such issue, we introduce\nControlRec, an innovative Contrastive prompt learning framework for\nRecommendation systems. ControlRec treats user IDs and NL as heterogeneous\nfeatures and encodes them individually. To promote greater alignment and\nintegration between them in the semantic space, we have devised two auxiliary\ncontrastive objectives: (1) Heterogeneous Feature Matching (HFM) aligning item\ndescription with the corresponding ID or user's next preferred ID based on\ntheir interaction sequence, and (2) Instruction Contrastive Learning (ICL)\neffectively merging these two crucial data sources by contrasting probability\ndistributions of output sequences generated by diverse tasks. Experimental\nresults on four public real-world datasets demonstrate the effectiveness of the\nproposed method on improving model performance.\n","authors":["Junyan Qiu","Haitao Wang","Zhaolin Hong","Yiping Yang","Qiang Liu","Xingxing Wang"],"pdf_url":"https://arxiv.org/pdf/2311.16441v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.10776v2","updated":"2023-11-28T02:21:40Z","published":"2023-11-16T01:21:33Z","title":"Towards an Automatic AI Agent for Reaction Condition Recommendation in\n Chemical Synthesis","summary":" Artificial intelligence (AI) for reaction condition optimization has become\nan important topic in the pharmaceutical industry, given that a data-driven AI\nmodel can assist drug discovery and accelerate reaction design. However,\nexisting AI models lack the chemical insights and real-time knowledge\nacquisition abilities of experienced human chemists. This paper proposes a\nLarge Language Model (LLM) empowered AI agent to bridge this gap. We put forth\na novel three-phase paradigm and applied advanced intelligence-enhancement\nmethods like in-context learning and multi-LLM debate so that the AI agent can\nborrow human insight and update its knowledge by searching the latest chemical\nliterature. Additionally, we introduce a novel Coarse-label Contrastive\nLearning (CCL) based chemical fingerprint that greatly enhances the agent's\nperformance in optimizing the reaction condition. With the above efforts, the\nproposed AI agent can autonomously generate the optimal reaction condition\nrecommendation without any human interaction. Further, the agent is highly\nprofessional in terms of chemical reactions. 
It demonstrates close-to-human\nperformance and strong generalization capability in both dry-lab and wet-lab\nexperiments. As the first attempt in the chemical AI agent, this work goes a\nstep further in the field of \"AI for chemistry\" and opens up new possibilities\nfor computer-aided synthesis planning.\n","authors":["Kexin Chen","Junyou Li","Kunyi Wang","Yuyang Du","Jiahui Yu","Jiamin Lu","Lanqing Li","Jiezhong Qiu","Qun Fang","Pheng Ann Heng","Guangyong Chen"],"pdf_url":"https://arxiv.org/pdf/2311.10776v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17136v1","updated":"2023-11-28T18:55:52Z","published":"2023-11-28T18:55:52Z","title":"UniIR: Training and Benchmarking Universal Multimodal Information\n Retrievers","summary":" Existing information retrieval (IR) models often assume a homogeneous format,\nlimiting their applicability to diverse user needs, such as searching for\nimages with text descriptions, searching for a news article with a headline\nimage, or finding a similar photo with a query image. To approach such\ndifferent information-seeking demands, we introduce UniIR, a unified\ninstruction-guided multimodal retriever capable of handling eight distinct\nretrieval tasks across modalities. UniIR, a single retrieval system jointly\ntrained on ten diverse multimodal-IR datasets, interprets user instructions to\nexecute various retrieval tasks, demonstrating robust performance across\nexisting datasets and zero-shot generalization to new tasks. Our experiments\nhighlight that multi-task training and instruction tuning are keys to UniIR's\ngeneralization ability. Additionally, we construct the M-BEIR, a multimodal\nretrieval benchmark with comprehensive results, to standardize the evaluation\nof universal multimodal information retrieval.\n","authors":["Cong Wei","Yang Chen","Haonan Chen","Hexiang Hu","Ge Zhang","Jie Fu","Alan Ritter","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2311.17136v1.pdf","comment":"Our code and dataset are available on this project page:\n https://tiger-ai-lab.github.io/UniIR/"},{"id":"http://arxiv.org/abs/2311.17107v1","updated":"2023-11-28T10:26:57Z","published":"2023-11-28T10:26:57Z","title":"ClimateX: Do LLMs Accurately Assess Human Expert Confidence in Climate\n Statements?","summary":" Evaluating the accuracy of outputs generated by Large Language Models (LLMs)\nis especially important in the climate science and policy domain. We introduce\nthe Expert Confidence in Climate Statements (ClimateX) dataset, a novel,\ncurated, expert-labeled dataset consisting of 8094 climate statements collected\nfrom the latest Intergovernmental Panel on Climate Change (IPCC) reports,\nlabeled with their associated confidence levels. Using this dataset, we show\nthat recent LLMs can classify human expert confidence in climate-related\nstatements, especially in a few-shot learning setting, but with limited (up to\n47%) accuracy. Overall, models exhibit consistent and significant\nover-confidence on low and medium confidence statements. 
We highlight\nimplications of our results for climate communication, LLMs evaluation\nstrategies, and the use of LLMs in information retrieval systems.\n","authors":["Romain Lacombe","Kerrie Wu","Eddie Dilworth"],"pdf_url":"https://arxiv.org/pdf/2311.17107v1.pdf","comment":"Tackling Climate Change with Machine Learning workshop at NeurIPS\n 2023"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2311.17059v1","updated":"2023-11-28T18:59:58Z","published":"2023-11-28T18:59:58Z","title":"Mission-driven Exploration for Accelerated Deep Reinforcement Learning\n with Temporal Logic Task Specifications","summary":" This paper addresses the problem of designing optimal control policies for\nmobile robots with mission and safety requirements specified using Linear\nTemporal Logic (LTL). We consider robots with unknown stochastic dynamics\noperating in environments with unknown geometric structure. The robots are\nequipped with sensors allowing them to detect obstacles. Our goal is to\nsynthesize a control policy that maximizes the probability of satisfying an\nLTL-encoded task in the presence of motion and environmental uncertainty.\nSeveral deep reinforcement learning (DRL) algorithms have been proposed\nrecently to address similar problems. A common limitation in related works is\nthat of slow learning performance. In order to address this issue, we propose a\nnovel DRL algorithm, which has the capability to learn control policies at a\nnotably faster rate compared to similar methods. Its sample efficiency is due\nto a mission-driven exploration strategy that prioritizes exploration towards\ndirections that may contribute to mission accomplishment. Identifying these\ndirections relies on an automaton representation of the LTL task as well as a\nlearned neural network that (partially) models the unknown system dynamics. We\nprovide comparative experiments demonstrating the efficiency of our algorithm\non robot navigation tasks in unknown environments.\n","authors":["Jun Wang","Hosein Hasanbeig","Kaiyuan Tan","Zihe Sun","Yiannis Kantaros"],"pdf_url":"https://arxiv.org/pdf/2311.17059v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17055v1","updated":"2023-11-28T18:59:46Z","published":"2023-11-28T18:59:46Z","title":"No Representation Rules Them All in Category Discovery","summary":" In this paper we tackle the problem of Generalized Category Discovery (GCD).\nSpecifically, given a dataset with labelled and unlabelled images, the task is\nto cluster all images in the unlabelled subset, whether or not they belong to\nthe labelled categories. Our first contribution is to recognize that most\nexisting GCD benchmarks only contain labels for a single clustering of the\ndata, making it difficult to ascertain whether models are using the available\nlabels to solve the GCD task, or simply solving an unsupervised clustering\nproblem. As such, we present a synthetic dataset, named 'Clevr-4', for category\ndiscovery. Clevr-4 contains four equally valid partitions of the data, i.e\nbased on object shape, texture, color or count. To solve the task, models are\nrequired to extrapolate the taxonomy specified by the labelled set, rather than\nsimply latching onto a single natural grouping of the data. We use this dataset\nto demonstrate the limitations of unsupervised clustering in the GCD setting,\nshowing that even very strong unsupervised models fail on Clevr-4. 
We further\nuse Clevr-4 to examine the weaknesses of existing GCD algorithms, and propose a\nnew method which addresses these shortcomings, leveraging consistent findings\nfrom the representation learning literature to do so. Our simple solution,\nwhich is based on 'mean teachers' and termed $\\mu$GCD, substantially\noutperforms implemented baselines on Clevr-4. Finally, when we transfer these\nfindings to real data on the challenging Semantic Shift Benchmark (SSB), we\nfind that $\\mu$GCD outperforms all prior work, setting a new state-of-the-art.\nFor the project webpage, see https://www.robots.ox.ac.uk/~vgg/data/clevr4/\n","authors":["Sagar Vaze","Andrea Vedaldi","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2311.17055v1.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2309.01291v2","updated":"2023-11-28T18:59:31Z","published":"2023-09-03T23:47:21Z","title":"Generative Social Choice","summary":" Traditionally, social choice theory has only been applicable to choices among\na few predetermined alternatives but not to more complex decisions such as\ncollectively selecting a textual statement. We introduce generative social\nchoice, a framework that combines the mathematical rigor of social choice\ntheory with the capability of large language models to generate text and\nextrapolate preferences. This framework divides the design of AI-augmented\ndemocratic processes into two components: first, proving that the process\nsatisfies rigorous representation guarantees when given access to oracle\nqueries; second, empirically validating that these queries can be approximately\nimplemented using a large language model. We apply this framework to the\nproblem of generating a slate of statements that is representative of opinions\nexpressed as free-form text; specifically, we develop a democratic process with\nrepresentation guarantees and use this process to represent the opinions of\nparticipants in a survey about chatbot personalization. We find that 93 out of\n100 participants feel \"mostly\" or \"perfectly\" represented by the slate of five\nstatements we extracted.\n","authors":["Sara Fish","Paul Gölz","David C. Parkes","Ariel D. Procaccia","Gili Rusak","Itai Shapira","Manuel Wüthrich"],"pdf_url":"https://arxiv.org/pdf/2309.01291v2.pdf","comment":"Substantially revised with non-approval utility model, new\n representation axiom (balanced justified representation), and real-world case\n study"},{"id":"http://arxiv.org/abs/2311.17053v1","updated":"2023-11-28T18:58:48Z","published":"2023-11-28T18:58:48Z","title":"DiffuseBot: Breeding Soft Robots With Physics-Augmented Generative\n Diffusion Models","summary":" Nature evolves creatures with a high complexity of morphological and\nbehavioral intelligence, meanwhile computational methods lag in approaching\nthat diversity and efficacy. Co-optimization of artificial creatures'\nmorphology and control in silico shows promise for applications in physical\nsoft robotics and virtual character creation; such approaches, however, require\ndeveloping new learning algorithms that can reason about function atop pure\nstructure. In this paper, we present DiffuseBot, a physics-augmented diffusion\nmodel that generates soft robot morphologies capable of excelling in a wide\nspectrum of tasks. 
DiffuseBot bridges the gap between virtually generated\ncontent and physical utility by (i) augmenting the diffusion process with a\nphysical dynamical simulation which provides a certificate of performance, and\n(ii) introducing a co-design procedure that jointly optimizes physical design\nand control by leveraging information about physical sensitivities from\ndifferentiable simulation. We showcase a range of simulated and fabricated\nrobots along with their capabilities. Check our website at\nhttps://diffusebot.github.io/\n","authors":["Tsun-Hsuan Wang","Juntian Zheng","Pingchuan Ma","Yilun Du","Byungchul Kim","Andrew Spielberg","Joshua Tenenbaum","Chuang Gan","Daniela Rus"],"pdf_url":"https://arxiv.org/pdf/2311.17053v1.pdf","comment":"NeurIPS 2023. Project page: https://diffusebot.github.io/"},{"id":"http://arxiv.org/abs/2311.17049v1","updated":"2023-11-28T18:55:42Z","published":"2023-11-28T18:55:42Z","title":"MobileCLIP: Fast Image-Text Models through Multi-Modal Reinforced\n Training","summary":" Contrastive pretraining of image-text foundation models, such as CLIP,\ndemonstrated excellent zero-shot performance and improved robustness on a wide\nrange of downstream tasks. However, these models utilize large\ntransformer-based encoders with significant memory and latency overhead which\npose challenges for deployment on mobile devices. In this work, we introduce\nMobileCLIP -- a new family of efficient image-text models optimized for runtime\nperformance along with a novel and efficient training approach, namely\nmulti-modal reinforced training. The proposed training approach leverages\nknowledge transfer from an image captioning model and an ensemble of strong\nCLIP encoders to improve the accuracy of efficient models. Our approach avoids\ntrain-time compute overhead by storing the additional knowledge in a reinforced\ndataset. MobileCLIP sets a new state-of-the-art latency-accuracy tradeoff for\nzero-shot classification and retrieval tasks on several datasets. Our\nMobileCLIP-S2 variant is 2.3$\\times$ faster while more accurate compared to\nprevious best CLIP model based on ViT-B/16. We further demonstrate the\neffectiveness of our multi-modal reinforced training by training a CLIP model\nbased on ViT-B/16 image backbone and achieving +2.9% average performance\nimprovement on 38 evaluation benchmarks compared to the previous best.\nMoreover, we show that the proposed approach achieves 10$\\times$-1000$\\times$\nimproved learning efficiency when compared with non-reinforced CLIP training.\n","authors":["Pavan Kumar Anasosalu Vasu","Hadi Pouransari","Fartash Faghri","Raviteja Vemulapalli","Oncel Tuzel"],"pdf_url":"https://arxiv.org/pdf/2311.17049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17035v1","updated":"2023-11-28T18:47:03Z","published":"2023-11-28T18:47:03Z","title":"Scalable Extraction of Training Data from (Production) Language Models","summary":" This paper studies extractable memorization: training data that an adversary\ncan efficiently extract by querying a machine learning model without prior\nknowledge of the training dataset. We show an adversary can extract gigabytes\nof training data from open-source language models like Pythia or GPT-Neo,\nsemi-open models like LLaMA or Falcon, and closed models like ChatGPT. 
Existing\ntechniques from the literature suffice to attack unaligned models; in order to\nattack the aligned ChatGPT, we develop a new divergence attack that causes the\nmodel to diverge from its chatbot-style generations and emit training data at a\nrate 150x higher than when behaving properly. Our methods show practical\nattacks can recover far more data than previously thought, and reveal that\ncurrent alignment techniques do not eliminate memorization.\n","authors":["Milad Nasr","Nicholas Carlini","Jonathan Hayase","Matthew Jagielski","A. Feder Cooper","Daphne Ippolito","Christopher A. Choquette-Choo","Eric Wallace","Florian Tramèr","Katherine Lee"],"pdf_url":"https://arxiv.org/pdf/2311.17035v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11876v2","updated":"2023-11-28T18:36:13Z","published":"2023-11-20T16:12:34Z","title":"Forward Gradients for Data-Driven CFD Wall Modeling","summary":" Computational Fluid Dynamics (CFD) is used in the design and optimization of\ngas turbines and many other industrial/ scientific applications. However, the\npractical use is often limited by the high computational cost, and the accurate\nresolution of near-wall flow is a significant contributor to this cost. Machine\nlearning (ML) and other data-driven methods can complement existing wall\nmodels. Nevertheless, training these models is bottlenecked by the large\ncomputational effort and memory footprint demanded by back-propagation. Recent\nwork has presented alternatives for computing gradients of neural networks\nwhere a separate forward and backward sweep is not needed and storage of\nintermediate results between sweeps is not required because an unbiased\nestimator for the gradient is computed in a single forward sweep. In this\npaper, we discuss the application of this approach for training a subgrid wall\nmodel that could potentially be used as a surrogate in wall-bounded flow CFD\nsimulations to reduce the computational overhead while preserving predictive\naccuracy.\n","authors":["Jan Hückelheim","Tadbhagya Kumar","Krishnan Raghavan","Pinaki Pal"],"pdf_url":"https://arxiv.org/pdf/2311.11876v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10498v3","updated":"2023-11-28T18:33:37Z","published":"2023-05-17T18:06:43Z","title":"Edge Directionality Improves Learning on Heterophilic Graphs","summary":" Graph Neural Networks (GNNs) have become the de-facto standard tool for\nmodeling relational data. However, while many real-world graphs are directed,\nthe majority of today's GNN models discard this information altogether by\nsimply making the graph undirected. The reasons for this are historical: 1)\nmany early variants of spectral GNNs explicitly required undirected graphs, and\n2) the first benchmarks on homophilic graphs did not find significant gain from\nusing direction. In this paper, we show that in heterophilic settings, treating\nthe graph as directed increases the effective homophily of the graph,\nsuggesting a potential gain from the correct use of directionality information.\nTo this end, we introduce Directed Graph Neural Network (Dir-GNN), a novel\ngeneral framework for deep learning on directed graphs. Dir-GNN can be used to\nextend any Message Passing Neural Network (MPNN) to account for edge\ndirectionality information by performing separate aggregations of the incoming\nand outgoing edges. We prove that Dir-GNN matches the expressivity of the\nDirected Weisfeiler-Lehman test, exceeding that of conventional MPNNs. 
In\nextensive experiments, we validate that while our framework leaves performance\nunchanged on homophilic datasets, it leads to large gains over base models such\nas GCN, GAT and GraphSage on heterophilic benchmarks, outperforming much more\ncomplex methods and achieving new state-of-the-art results.\n","authors":["Emanuele Rossi","Bertrand Charpentier","Francesco Di Giovanni","Fabrizio Frasca","Stephan Günnemann","Michael Bronstein"],"pdf_url":"https://arxiv.org/pdf/2305.10498v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17030v1","updated":"2023-11-28T18:32:19Z","published":"2023-11-28T18:32:19Z","title":"Is This the Subspace You Are Looking for? An Interpretability Illusion\n for Subspace Activation Patching","summary":" Mechanistic interpretability aims to understand model behaviors in terms of\nspecific, interpretable features, often hypothesized to manifest as\nlow-dimensional subspaces of activations. Specifically, recent studies have\nexplored subspace interventions (such as activation patching) as a way to\nsimultaneously manipulate model behavior and attribute the features behind it\nto given subspaces.\n In this work, we demonstrate that these two aims diverge, potentially leading\nto an illusory sense of interpretability. Counterintuitively, even if a\nsubspace intervention makes the model's output behave as if the value of a\nfeature was changed, this effect may be achieved by activating a dormant\nparallel pathway leveraging another subspace that is causally disconnected from\nmodel outputs. We demonstrate this phenomenon in a distilled mathematical\nexample, in two real-world domains (the indirect object identification task and\nfactual recall), and present evidence for its prevalence in practice. In the\ncontext of factual recall, we further show a link to rank-1 fact editing,\nproviding a mechanistic explanation for previous work observing an\ninconsistency between fact editing performance and fact localization.\n However, this does not imply that activation patching of subspaces is\nintrinsically unfit for interpretability. To contextualize our findings, we\nalso show what a success case looks like in a task (indirect object\nidentification) where prior manual circuit analysis informs an understanding of\nthe location of a feature. We explore the additional evidence needed to argue\nthat a patched subspace is faithful.\n","authors":["Aleksandar Makelov","Georg Lange","Neel Nanda"],"pdf_url":"https://arxiv.org/pdf/2311.17030v1.pdf","comment":"NeurIPS 2023 Workshop on Attributing Model Behavior at Scale"},{"id":"http://arxiv.org/abs/2311.09312v2","updated":"2023-11-28T18:31:07Z","published":"2023-11-15T19:12:47Z","title":"H-Packer: Holographic Rotationally Equivariant Convolutional Neural\n Network for Protein Side-Chain Packing","summary":" Accurately modeling protein 3D structure is essential for the design of\nfunctional proteins. An important sub-task of structure modeling is protein\nside-chain packing: predicting the conformation of side-chains (rotamers) given\nthe protein's backbone structure and amino-acid sequence. Conventional\napproaches for this task rely on expensive sampling procedures over\nhand-crafted energy functions and rotamer libraries. Recently, several deep\nlearning methods have been developed to tackle the problem in a data-driven\nway, albeit with vastly different formulations (from image-to-image translation\nto directly predicting atomic coordinates). 
Here, we frame the problem as a\njoint regression over the side-chains' true degrees of freedom: the dihedral\n$\\chi$ angles. We carefully study possible objective functions for this task,\nwhile accounting for the underlying symmetries of the task. We propose\nHolographic Packer (H-Packer), a novel two-stage algorithm for side-chain\npacking built on top of two light-weight rotationally equivariant neural\nnetworks. We evaluate our method on CASP13 and CASP14 targets. H-Packer is\ncomputationally efficient and shows favorable performance against conventional\nphysics-based algorithms and is competitive against alternative deep learning\nsolutions.\n","authors":["Gian Marco Visani","William Galvin","Michael Neal Pun","Armita Nourmohammad"],"pdf_url":"https://arxiv.org/pdf/2311.09312v2.pdf","comment":"Accepted as a conference paper at MLCB 2023. 8 pages main body, 20\n pages with appendix. 10 figures"},{"id":"http://arxiv.org/abs/2311.12878v2","updated":"2023-11-28T18:29:09Z","published":"2023-11-20T17:59:30Z","title":"Adaptive Bayesian Learning with Action and State-Dependent Signal\n Variance","summary":" This manuscript presents an advanced framework for Bayesian learning by\nincorporating action and state-dependent signal variances into decision-making\nmodels. This framework is pivotal in understanding complex data-feedback loops\nand decision-making processes in various economic systems. Through a series of\nexamples, we demonstrate the versatility of this approach in different\ncontexts, ranging from simple Bayesian updating in stable environments to\ncomplex models involving social learning and state-dependent uncertainties. The\npaper uniquely contributes to the understanding of the nuanced interplay\nbetween data, actions, outcomes, and the inherent uncertainty in economic\nmodels.\n","authors":["Kaiwen Hou"],"pdf_url":"https://arxiv.org/pdf/2311.12878v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17026v1","updated":"2023-11-28T18:28:03Z","published":"2023-11-28T18:28:03Z","title":"When the Few Outweigh the Many: Illicit Content Recognition with\n Few-Shot Learning","summary":" The anonymity and untraceability benefits of the Dark web account for the\nexponentially-increased potential of its popularity while creating a suitable\nwomb for many illicit activities, to date. Hence, in collaboration with\ncybersecurity and law enforcement agencies, research has provided approaches\nfor recognizing and classifying illicit activities with most exploiting textual\ndark web markets' content recognition; few such approaches use images that\noriginated from dark web content. This paper investigates this alternative\ntechnique for recognizing illegal activities from images. In particular, we\ninvestigate label-agnostic learning techniques like One-Shot and Few-Shot\nlearning featuring the use Siamese neural networks, a state-of-the-art approach\nin the field. Our solution manages to handle small-scale datasets with\npromising accuracy. In particular, Siamese neural networks reach 90.9% on\n20-Shot experiments over a 10-class dataset; this leads us to conclude that\nsuch models are a promising and cheaper alternative to the definition of\nautomated law-enforcing machinery over the dark web.\n","authors":["G. Cascavilla","G. Catolino","M. Conti","D. Mellios","D. A. 
Tamburri"],"pdf_url":"https://arxiv.org/pdf/2311.17026v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15936v2","updated":"2023-11-28T18:22:44Z","published":"2023-11-27T15:45:02Z","title":"Towards Responsible Governance of Biological Design Tools","summary":" Recent advancements in generative machine learning have enabled rapid\nprogress in biological design tools (BDTs) such as protein structure and\nsequence prediction models. The unprecedented predictive accuracy and novel\ndesign capabilities of BDTs present new and significant dual-use risks. For\nexample, their predictive accuracy allows biological agents, whether vaccines\nor pathogens, to be developed more quickly, while the design capabilities could\nbe used to discover drugs or evade DNA screening techniques. Similar to other\ndual-use AI systems, BDTs present a wicked problem: how can regulators uphold\npublic safety without stifling innovation? We highlight how current regulatory\nproposals that are primarily tailored toward large language models may be less\neffective for BDTs, which require fewer computational resources to train and\nare often developed in an open-source manner. We propose a range of measures to\nmitigate the risk that BDTs are misused, across the areas of responsible\ndevelopment, risk assessment, transparency, access management, cybersecurity,\nand investing in resilience. Implementing such measures will require close\ncoordination between developers and governments.\n","authors":["Richard Moulange","Max Langenkamp","Tessa Alexanian","Samuel Curtis","Morgan Livingston"],"pdf_url":"https://arxiv.org/pdf/2311.15936v2.pdf","comment":"10 pages + references, 1 figure, accepted at NeurIPS 2023 Workshop on\n Regulatable ML as oral presentation"},{"id":"http://arxiv.org/abs/2311.17007v1","updated":"2023-11-28T18:02:06Z","published":"2023-11-28T18:02:06Z","title":"Computational Hypergraph Discovery, a Gaussian Process framework for\n connecting the dots","summary":" Most scientific challenges can be framed into one of the following three\nlevels of complexity of function approximation. Type 1: Approximate an unknown\nfunction given input/output data. Type 2: Consider a collection of variables\nand functions, some of which are unknown, indexed by the nodes and hyperedges\nof a hypergraph (a generalized graph where edges can connect more than two\nvertices). Given partial observations of the variables of the hypergraph\n(satisfying the functional dependencies imposed by its structure), approximate\nall the unobserved variables and unknown functions. Type 3: Expanding on Type\n2, if the hypergraph structure itself is unknown, use partial observations of\nthe variables of the hypergraph to discover its structure and approximate its\nunknown functions. While most Computational Science and Engineering and\nScientific Machine Learning challenges can be framed as Type 1 and Type 2\nproblems, many scientific problems can only be categorized as Type 3. Despite\ntheir prevalence, these Type 3 challenges have been largely overlooked due to\ntheir inherent complexity. Although Gaussian Process (GP) methods are sometimes\nperceived as well-founded but old technology limited to Type 1 curve fitting,\ntheir scope has recently been expanded to Type 2 problems. In this paper, we\nintroduce an interpretable GP framework for Type 3 problems, targeting the\ndata-driven discovery and completion of computational hypergraphs. 
Our approach\nis based on a kernel generalization of Row Echelon Form reduction from linear\nsystems to nonlinear ones and variance-based analysis. Here, variables are\nlinked via GPs and those contributing to the highest data variance unveil the\nhypergraph's structure. We illustrate the scope and efficiency of the proposed\napproach with applications to (algebraic) equation discovery, network discovery\n(gene pathways, chemical, and mechanical) and raw data analysis.\n","authors":["Théo Bourdais","Pau Batlle","Xianjin Yang","Ricardo Baptista","Nicolas Rouquette","Houman Owhadi"],"pdf_url":"https://arxiv.org/pdf/2311.17007v1.pdf","comment":"The code for the algorithm introduced in this paper and its\n application to various examples are available for download (and as as an\n installable python library/package) at\n https://github.com/TheoBourdais/ComputationalHypergraphDiscovery"},{"id":"http://arxiv.org/abs/2311.17008v1","updated":"2023-11-28T18:02:06Z","published":"2023-11-28T18:02:06Z","title":"An Investigation of Time Reversal Symmetry in Reinforcement Learning","summary":" One of the fundamental challenges associated with reinforcement learning (RL)\nis that collecting sufficient data can be both time-consuming and expensive. In\nthis paper, we formalize a concept of time reversal symmetry in a Markov\ndecision process (MDP), which builds upon the established structure of\ndynamically reversible Markov chains (DRMCs) and time-reversibility in\nclassical physics. Specifically, we investigate the utility of this concept in\nreducing the sample complexity of reinforcement learning. We observe that\nutilizing the structure of time reversal in an MDP allows every environment\ntransition experienced by an agent to be transformed into a feasible\nreverse-time transition, effectively doubling the number of experiences in the\nenvironment. To test the usefulness of this newly synthesized data, we develop\na novel approach called time symmetric data augmentation (TSDA) and investigate\nits application in both proprioceptive and pixel-based state within the realm\nof off-policy, model-free RL. Empirical evaluations showcase how these\nsynthetic transitions can enhance the sample efficiency of RL agents in time\nreversible scenarios without friction or contact. We also test this method in\nmore realistic environments where these assumptions are not globally satisfied.\nWe find that TSDA can significantly degrade sample efficiency and policy\nperformance, but can also improve sample efficiency under the right conditions.\nUltimately we conclude that time symmetry shows promise in enhancing the sample\nefficiency of reinforcement learning and provide guidance when the environment\nand reward structures are of an appropriate form for TSDA to be employed\neffectively.\n","authors":["Brett Barkley","Amy Zhang","David Fridovich-Keil"],"pdf_url":"https://arxiv.org/pdf/2311.17008v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17006v1","updated":"2023-11-28T17:59:49Z","published":"2023-11-28T17:59:49Z","title":"On the Impact of Sampling on Deep Sequential State Estimation","summary":" State inference and parameter learning in sequential models can be\nsuccessfully performed with approximation techniques that maximize the evidence\nlower bound to the marginal log-likelihood of the data distribution. These\nmethods may be referred to as Dynamical Variational Autoencoders, and our\nspecific focus lies on the deep Kalman filter. 
It has been shown that the ELBO\nobjective can oversimplify data representations, potentially compromising\nestimation quality. Tighter Monte Carlo objectives have been proposed in the\nliterature to enhance generative modeling performance. For instance, the IWAE\nobjective uses importance weights to reduce the variance of marginal\nlog-likelihood estimates. In this paper, importance sampling is applied to the\nDKF framework for learning deep Markov models, resulting in the IW-DKF, which\nshows an improvement in terms of log-likelihood estimates and KL divergence\nbetween the variational distribution and the transition model. The framework\nusing the sampled DKF update rule is also accommodated to address sequential\nstate and parameter estimation when working with highly non-linear\nphysics-based models. An experiment with the 3-space Lorenz attractor shows an\nenhanced generative modeling performance and also a decrease in RMSE when\nestimating the model parameters and latent states, indicating that tighter MCOs\nlead to improved state inference performance.\n","authors":["Helena Calatrava","Ricardo Augusto Borsoi","Tales Imbiriba","Pau Closas"],"pdf_url":"https://arxiv.org/pdf/2311.17006v1.pdf","comment":"To appear in the Proceedings of the Asilomar Conference on Signals,\n Systems, and Computers, October 2023, 5 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2311.16996v1","updated":"2023-11-28T17:48:18Z","published":"2023-11-28T17:48:18Z","title":"Goal-conditioned Offline Planning from Curious Exploration","summary":" Curiosity has established itself as a powerful exploration strategy in deep\nreinforcement learning. Notably, leveraging expected future novelty as\nintrinsic motivation has been shown to efficiently generate exploratory\ntrajectories, as well as a robust dynamics model. We consider the challenge of\nextracting goal-conditioned behavior from the products of such unsupervised\nexploration techniques, without any additional environment interaction. We find\nthat conventional goal-conditioned reinforcement learning approaches for\nextracting a value function and policy fall short in this difficult offline\nsetting. By analyzing the geometry of optimal goal-conditioned value functions,\nwe relate this issue to a specific class of estimation artifacts in learned\nvalues. In order to mitigate their occurrence, we propose to combine\nmodel-based planning over learned value landscapes with a graph-based value\naggregation scheme. We show how this combination can correct both local and\nglobal artifacts, obtaining significant improvements in zero-shot goal-reaching\nperformance across diverse simulated environments.\n","authors":["Marco Bagatella","Georg Martius"],"pdf_url":"https://arxiv.org/pdf/2311.16996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16984v1","updated":"2023-11-28T17:35:38Z","published":"2023-11-28T17:35:38Z","title":"FedECA: A Federated External Control Arm Method for Causal Inference\n with Time-To-Event Data in Distributed Settings","summary":" External control arms (ECA) can inform the early clinical development of\nexperimental drugs and provide efficacy evidence for regulatory approval in\nnon-randomized settings. However, the main challenge of implementing ECA lies\nin accessing real-world data or historical clinical trials. Indeed, data\nsharing is often not feasible due to privacy considerations related to data\nleaving the original collection centers, along with pharmaceutical companies'\ncompetitive motives. 
In this paper, we leverage a privacy-enhancing technology\ncalled federated learning (FL) to remove some of the barriers to data sharing.\nWe introduce a federated learning inverse probability of treatment weighted\n(IPTW) method for time-to-event outcomes called FedECA which eases the\nimplementation of ECA by limiting patients' data exposure. We show with\nextensive experiments that FedECA outperforms its closest competitor,\nmatching-adjusted indirect comparison (MAIC), in terms of statistical power and\nability to balance the treatment and control groups. To encourage the use of\nsuch methods, we publicly release our code which relies on Substra, an\nopen-source FL software with proven experience in privacy-sensitive contexts.\n","authors":["Jean Ogier du Terrail","Quentin Klopfenstein","Honghao Li","Imke Mayer","Nicolas Loiseau","Mohammad Hallal","Félix Balazard","Mathieu Andreux"],"pdf_url":"https://arxiv.org/pdf/2311.16984v1.pdf","comment":"code available at: https://github.com/owkin/fedeca"},{"id":"http://arxiv.org/abs/2311.16977v1","updated":"2023-11-28T17:25:16Z","published":"2023-11-28T17:25:16Z","title":"Bidirectional Reactive Programming for Machine Learning","summary":" Reactive languages are dedicated to the programming of systems which interact\ncontinuously and concurrently with their environment. Values take the form of\nunbounded streams modeling the (discrete) passing of time or the sequence of\nconcurrent interactions. While conventional reactivity models recurrences\nforward in time, we introduce a symmetric reactive construct enabling backward\nrecurrences. Constraints on the latter allow to make the implementation\npractical. Machine Learning (ML) systems provide numerous motivations for all\nof this: we demonstrate that reverse-mode automatic differentiation,\nbackpropagation, batch normalization, bidirectional recurrent neural networks,\ntraining and reinforcement learning algorithms, are all naturally captured as\nbidirectional reactive programs.\n","authors":["Dumitru Potop Butucaru","Albert Cohen","Gordon Plotkin","Hugo Pompougnac"],"pdf_url":"https://arxiv.org/pdf/2311.16977v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16847v2","updated":"2023-11-28T17:24:29Z","published":"2023-08-31T16:26:17Z","title":"Diffusion Models for Interferometric Satellite Aperture Radar","summary":" Probabilistic Diffusion Models (PDMs) have recently emerged as a very\npromising class of generative models, achieving high performance in natural\nimage generation. However, their performance relative to non-natural images,\nlike radar-based satellite data, remains largely unknown. Generating large\namounts of synthetic (and especially labelled) satellite data is crucial to\nimplement deep-learning approaches for the processing and analysis of\n(interferometric) satellite aperture radar data. Here, we leverage PDMs to\ngenerate several radar-based satellite image datasets. We show that PDMs\nsucceed in generating images with complex and realistic structures, but that\nsampling time remains an issue. Indeed, accelerated sampling strategies, which\nwork well on simple image datasets like MNIST, fail on our radar datasets. 
We\nprovide a simple and versatile open-source\nhttps://github.com/thomaskerdreux/PDM_SAR_InSAR_generation to train, sample and\nevaluate PDMs using any dataset on a single GPU.\n","authors":["Alexandre Tuel","Thomas Kerdreux","Claudia Hulbert","Bertrand Rouet-Leduc"],"pdf_url":"https://arxiv.org/pdf/2308.16847v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16964v1","updated":"2023-11-28T17:12:03Z","published":"2023-11-28T17:12:03Z","title":"Machine learning force-field models for metallic spin glass","summary":" Metallic spin glass systems, such as dilute magnetic alloys, are\ncharacterized by randomly distributed local moments coupled to each other\nthrough a long-range electron-mediated effective interaction. We present a\nscalable machine learning (ML) framework for dynamical simulations of metallic\nspin glasses. A Behler-Parrinello type neural-network model, based on the\nprinciple of locality, is developed to accurately and efficiently predict\nelectron-induced local magnetic fields that drive the spin dynamics. A crucial\ncomponent of the ML model is a proper symmetry-invariant representation of\nlocal magnetic environment which is direct input to the neural net. We develop\nsuch a magnetic descriptor by incorporating the spin degrees of freedom into\nthe atom-centered symmetry function methods which are widely used in ML\nforce-field models for quantum molecular dynamics. We apply our approach to\nstudy the relaxation dynamics of an amorphous generalization of the s-d model.\nOur work highlights the promising potential of ML models for large-scale\ndynamical modeling of itinerant magnets with quenched disorder.\n","authors":["Menglin Shi","Sheng Zhang","Gia-Wei Chern"],"pdf_url":"https://arxiv.org/pdf/2311.16964v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.16956v1","updated":"2023-11-28T17:03:56Z","published":"2023-11-28T17:03:56Z","title":"Adaptive Step Sizes for Preconditioned Stochastic Gradient Descent","summary":" This paper proposes a novel approach to adaptive step sizes in stochastic\ngradient descent (SGD) by utilizing quantities that we have identified as\nnumerically traceable -- the Lipschitz constant for gradients and a concept of\nthe local variance in search directions. Our findings yield a nearly\nhyperparameter-free algorithm for stochastic optimization, which has provable\nconvergence properties when applied to quadratic problems and exhibits truly\nproblem adaptive behavior on classical image classification tasks. Our\nframework enables the potential inclusion of a preconditioner, thereby enabling\nthe implementation of adaptive step sizes for stochastic second-order\noptimization methods.\n","authors":["Frederik Köhne","Leonie Kreis","Anton Schiela","Roland Herzog"],"pdf_url":"https://arxiv.org/pdf/2311.16956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04486v2","updated":"2023-11-28T17:02:31Z","published":"2023-10-06T15:45:28Z","title":"T-Rep: Representation Learning for Time Series using Time-Embeddings","summary":" Multivariate time series present challenges to standard machine learning\ntechniques, as they are often unlabeled, high dimensional, noisy, and contain\nmissing data. To address this, we propose T-Rep, a self-supervised method to\nlearn time series representations at a timestep granularity. 
T-Rep learns\nvector embeddings of time alongside its feature extractor, to extract temporal\nfeatures such as trend, periodicity, or distribution shifts from the signal.\nThese time-embeddings are leveraged in pretext tasks, to incorporate smooth and\nfine-grained temporal dependencies in the representations, as well as reinforce\nrobustness to missing data. We evaluate T-Rep on downstream classification,\nforecasting, and anomaly detection tasks. It is compared to existing\nself-supervised algorithms for time series, which it outperforms in all three\ntasks. We test T-Rep in missing data regimes, where it proves more resilient\nthan its counterparts. Finally, we provide latent space visualisation\nexperiments, highlighting the interpretability of the learned representations.\n","authors":["Archibald Fraikin","Adrien Bennetot","Stéphanie Allassonnière"],"pdf_url":"https://arxiv.org/pdf/2310.04486v2.pdf","comment":"Under review at ICLR 2024"},{"id":"http://arxiv.org/abs/2310.06328v3","updated":"2023-11-28T16:59:02Z","published":"2023-10-10T05:54:00Z","title":"Antenna Response Consistency Driven Self-supervised Learning for\n WIFI-based Human Activity Recognition","summary":" Self-supervised learning (SSL) for WiFi-based human activity recognition\n(HAR) holds great promise due to its ability to address the challenge of\ninsufficient labeled data. However, directly transplanting SSL algorithms,\nespecially contrastive learning, originally designed for other domains to CSI\ndata, often fails to achieve the expected performance. We attribute this issue\nto the inappropriate alignment criteria, which disrupt the semantic distance\nconsistency between the feature space and the input space. To address this\nchallenge, we introduce \\textbf{A}ntenna \\textbf{R}esponse \\textbf{C}onsistency\n(ARC) as a solution to define proper alignment criteria. ARC is designed to\nretain semantic information from the input space while introducing robustness\nto real-world noise. Moreover, we substantiate the effectiveness of ARC through\na comprehensive set of experiments, demonstrating its capability to enhance the\nperformance of self-supervised learning for WiFi-based HAR by achieving an\nincrease of over 5\\% in accuracy in most cases and achieving a best accuracy of\n94.97\\%.\n","authors":["Ke Xu","Jiangtao Wang","Hongyuan Zhu","Dingchang Zheng"],"pdf_url":"https://arxiv.org/pdf/2310.06328v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13658v3","updated":"2023-11-28T16:47:56Z","published":"2023-09-24T14:53:51Z","title":"Fantastic Generalization Measures are Nowhere to be Found","summary":" We study the notion of a generalization bound being uniformly tight, meaning\nthat the difference between the bound and the population loss is small for all\nlearning algorithms and all population distributions. Numerous generalization\nbounds have been proposed in the literature as potential explanations for the\nability of neural networks to generalize in the overparameterized setting.\nHowever, in their paper ``Fantastic Generalization Measures and Where to Find\nThem,'' Jiang et al. (2020) examine more than a dozen generalization bounds,\nand show empirically that none of them are uniformly tight. This raises the\nquestion of whether uniformly-tight generalization bounds are at all possible\nin the overparameterized setting. We consider two types of generalization\nbounds: (1) bounds that may depend on the training set and the learned\nhypothesis (e.g., margin bounds). 
We prove mathematically that no such bound\ncan be uniformly tight in the overparameterized setting; (2) bounds that may in\naddition also depend on the learning algorithm (e.g., stability bounds). For\nthese bounds, we show a trade-off between the algorithm's performance and the\nbound's tightness. Namely, if the algorithm achieves good accuracy on certain\ndistributions, then no generalization bound can be uniformly tight for it in\nthe overparameterized setting. We explain how these formal results can, in our\nview, inform research on generalization bounds for neural networks, while\nstressing that other interpretations of these results are also possible.\n","authors":["Michael Gastpar","Ido Nachum","Jonathan Shafer","Thomas Weinberger"],"pdf_url":"https://arxiv.org/pdf/2309.13658v3.pdf","comment":"34 pages, 1 figure. Minor fix: subsection 6.2 -> section 7"},{"id":"http://arxiv.org/abs/2311.16943v1","updated":"2023-11-28T16:46:44Z","published":"2023-11-28T16:46:44Z","title":"Image segmentation with traveling waves in an exactly solvable recurrent\n neural network","summary":" We study image segmentation using spatiotemporal dynamics in a recurrent\nneural network where the state of each unit is given by a complex number. We\nshow that this network generates sophisticated spatiotemporal dynamics that can\neffectively divide an image into groups according to a scene's structural\ncharacteristics. Using an exact solution of the recurrent network's dynamics,\nwe present a precise description of the mechanism underlying object\nsegmentation in this network, providing a clear mathematical interpretation of\nhow the network performs this task. We then demonstrate a simple algorithm for\nobject segmentation that generalizes across inputs ranging from simple\ngeometric objects in grayscale images to natural images. Object segmentation\nacross all images is accomplished with one recurrent neural network that has a\nsingle, fixed set of weights. This demonstrates the expressive potential of\nrecurrent neural networks when constructed using a mathematical approach that\nbrings together their structure, dynamics, and computation.\n","authors":["Luisa H. B. Liboni","Roberto C. Budzinski","Alexandra N. Busch","Sindy Löwe","Thomas A. Keller","Max Welling","Lyle E. Muller"],"pdf_url":"https://arxiv.org/pdf/2311.16943v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16941v1","updated":"2023-11-28T16:46:14Z","published":"2023-11-28T16:46:14Z","title":"Debiasing Multimodal Models via Causal Information Minimization","summary":" Most existing debiasing methods for multimodal models, including causal\nintervention and inference methods, utilize approximate heuristics to represent\nthe biases, such as shallow features from early stages of training or unimodal\nfeatures for multimodal tasks like VQA, etc., which may not be accurate. In\nthis paper, we study bias arising from confounders in a causal graph for\nmultimodal data and examine a novel approach that leverages causally-motivated\ninformation minimization to learn the confounder representations. Robust\npredictive features contain diverse information that helps a model generalize\nto out-of-distribution data. Hence, minimizing the information content of\nfeatures obtained from a pretrained biased model helps learn the simplest\npredictive features that capture the underlying data distribution. We treat\nthese features as confounder representations and use them via methods motivated\nby causal theory to remove bias from models. 
We find that the learned\nconfounder representations indeed capture dataset biases, and the proposed\ndebiasing methods improve out-of-distribution (OOD) performance on multiple\nmultimodal datasets without sacrificing in-distribution performance.\nAdditionally, we introduce a novel metric to quantify the sufficiency of\nspurious features in models' predictions that further demonstrates the\neffectiveness of our proposed methods. Our code is available at:\nhttps://github.com/Vaidehi99/CausalInfoMin\n","authors":["Vaidehi Patil","Adyasha Maharana","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2311.16941v1.pdf","comment":"EMNLP 2023 Findings (16 pages)"},{"id":"http://arxiv.org/abs/2309.13411v2","updated":"2023-11-28T16:41:35Z","published":"2023-09-23T15:48:35Z","title":"Towards Attributions of Input Variables in a Coalition","summary":" This paper aims to develop a new attribution method to explain the conflict\nbetween individual variables' attributions and their coalition's attribution\nfrom an entirely new perspective. First, we find that the Shapley value can be\nreformulated as the allocation of Harsanyi interactions encoded by the AI\nmodel. Second, based on the re-allocation of interactions, we extend the\nShapley value to the attribution of coalitions. Third, we derive the\nfundamental mechanism behind the conflict, which arises from interactions that\ncontain only some of the variables in the coalition.\n","authors":["Xinhao Zheng","Huiqi Deng","Bo Fan","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.13411v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03059v3","updated":"2023-11-28T16:31:34Z","published":"2023-10-04T16:49:36Z","title":"Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models","summary":" The popularity of pre-trained large models has revolutionized downstream\ntasks across diverse fields, such as language, vision, and multi-modality. To\nminimize the adaptation cost for downstream tasks, many Parameter-Efficient\nFine-Tuning (PEFT) techniques are proposed for language and 2D image\npre-trained models. However, the specialized PEFT method for 3D pre-trained\nmodels is still under-explored. To this end, we introduce Point-PEFT, a novel\nframework for adapting point cloud pre-trained models with minimal learnable\nparameters. Specifically, for a pre-trained 3D model, we freeze most of its\nparameters, and only tune the newly added PEFT modules on downstream tasks,\nwhich consist of a Point-prior Prompt and a Geometry-aware Adapter. The\nPoint-prior Prompt adopts a set of learnable prompt tokens, for which we\npropose to construct a memory bank with domain-specific knowledge, and utilize\na parameter-free attention to enhance the prompt tokens. The Geometry-aware\nAdapter aims to aggregate point cloud features within spatial neighborhoods to\ncapture fine-grained geometric information through local interactions.\nExtensive experiments indicate that our Point-PEFT can achieve better\nperformance than the full fine-tuning on various downstream tasks, while using\nonly 5% of the trainable parameters, demonstrating the efficiency and\neffectiveness of our approach. Code will be released at\nhttps://github.com/Even-JK/PEFT-3D.\n","authors":["Ivan Tang","Ray Zhang","Zoey Guo","Dong Wang","Zhigang Wang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2310.03059v3.pdf","comment":"10 pages. 
The specialized PEFT framework for 3D pre-trained models,\n which achieves competitive performance to full fine-tuning, and significantly\n reduces the computational resources. Project page:\n https://github.com/Even-JK/PEFT-3D"},{"id":"http://arxiv.org/abs/2206.10479v3","updated":"2023-11-28T16:23:08Z","published":"2022-06-21T15:44:49Z","title":"Policy Learning with Asymmetric Counterfactual Utilities","summary":" Data-driven decision making plays an important role even in high stakes\nsettings like medicine and public policy. Learning optimal policies from\nobserved data requires a careful formulation of the utility function whose\nexpected value is maximized across a population. Although researchers typically\nuse utilities that depend on observed outcomes alone, in many settings the\ndecision maker's utility function is more properly characterized by the joint\nset of potential outcomes under all actions. For example, the Hippocratic\nprinciple to \"do no harm\" implies that the cost of causing death to a patient\nwho would otherwise survive without treatment is greater than the cost of\nforgoing life-saving treatment. We consider optimal policy learning with\nasymmetric counterfactual utility functions of this form that consider the\njoint set of potential outcomes. We show that asymmetric counterfactual\nutilities lead to an unidentifiable expected utility function, and so we first\npartially identify it. Drawing on statistical decision theory, we then derive\nminimax decision rules by minimizing the maximum expected utility loss relative\nto different alternative policies. We show that one can learn minimax loss\ndecision rules from observed data by solving intermediate classification\nproblems, and establish that the finite sample excess expected utility loss of\nthis procedure is bounded by the regret of these intermediate classifiers. We\napply this conceptual framework and methodology to the decision about whether\nor not to use right heart catheterization for patients with possible pulmonary\nhypertension.\n","authors":["Eli Ben-Michael","Kosuke Imai","Zhichao Jiang"],"pdf_url":"https://arxiv.org/pdf/2206.10479v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16909v1","updated":"2023-11-28T16:12:50Z","published":"2023-11-28T16:12:50Z","title":"Multinomial belief networks","summary":" A Bayesian approach to machine learning is attractive when we need to\nquantify uncertainty, deal with missing observations, when samples are scarce,\nor when the data is sparse. All of these commonly apply when analysing\nhealthcare data. To address these analytical requirements, we propose a deep\ngenerative model for multinomial count data where both the weights and hidden\nunits of the network are Dirichlet distributed. A Gibbs sampling procedure is\nformulated that takes advantage of a series of augmentation relations,\nanalogous to the Zhou-Cong-Chen model. We apply the model on small handwritten\ndigits, and a large experimental dataset of DNA mutations in cancer, and we\nshow how the model is able to extract biologically meaningful meta-signatures\nin a fully data-driven way.\n","authors":["H. C. Donker","D. Neijzen","G. A. 
Lunter"],"pdf_url":"https://arxiv.org/pdf/2311.16909v1.pdf","comment":"9 pages, 3 figs; supplement: 13 pages"},{"id":"http://arxiv.org/abs/2310.08659v4","updated":"2023-11-28T16:06:59Z","published":"2023-10-12T18:34:08Z","title":"LoftQ: LoRA-Fine-Tuning-Aware Quantization for Large Language Models","summary":" Quantization is an indispensable technique for serving Large Language Models\n(LLMs) and has recently found its way into LoRA fine-tuning. In this work we\nfocus on the scenario where quantization and LoRA fine-tuning are applied\ntogether on a pre-trained model. In such cases it is common to observe a\nconsistent gap in the performance on downstream tasks between full fine-tuning\nand quantization plus LoRA fine-tuning approach. In response, we propose LoftQ\n(LoRA-Fine-Tuning-aware Quantization), a novel quantization framework that\nsimultaneously quantizes an LLM and finds a proper low-rank initialization for\nLoRA fine-tuning. Such an initialization alleviates the discrepancy between the\nquantized and full-precision model and significantly improves generalization in\ndownstream tasks. We evaluate our method on natural language understanding,\nquestion answering, summarization, and natural language generation tasks.\nExperiments show that our method is highly effective and outperforms existing\nquantization methods, especially in the challenging 2-bit and 2/4-bit mixed\nprecision regimes. The code is available on https://github.com/yxli2123/LoftQ.\n","authors":["Yixiao Li","Yifan Yu","Chen Liang","Pengcheng He","Nikos Karampatziakis","Weizhu Chen","Tuo Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.08659v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06627v3","updated":"2023-11-28T15:57:16Z","published":"2023-10-10T13:45:59Z","title":"What If the TV Was Off? Examining Counterfactual Reasoning Abilities of\n Multi-modal Language Models","summary":" Counterfactual reasoning, a fundamental aspect of human cognition, involves\ncontemplating alternatives to established facts or past events, significantly\nenhancing our abilities in planning and decision-making. In light of the\nadvancements in current multi-modal large language models, we explore their\neffectiveness in counterfactual reasoning. To facilitate this investigation, we\nintroduce a novel dataset, C-VQA, specifically designed to test the\ncounterfactual reasoning capabilities of modern multi-modal large language\nmodels. This dataset is constructed by infusing original questions with\ncounterfactual presuppositions, spanning various types such as numerical and\nboolean queries. It encompasses a mix of real and synthetic data, representing\na wide range of difficulty levels. Our thorough evaluations of contemporary\nvision-language models using this dataset have revealed substantial performance\ndrops, with some models showing up to a 40% decrease, highlighting a\nsignificant gap between current models and human-like vision reasoning\ncapabilities. We hope our dataset will serve as a vital benchmark for\nevaluating the counterfactual reasoning capabilities of models. 
Code and\ndataset are publicly available at https://bzhao.me/C-VQA/.\n","authors":["Letian Zhang","Xiaotong Zhai","Zhongkai Zhao","Yongshuo Zong","Xin Wen","Bingchen Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.06627v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09790v3","updated":"2023-11-28T15:53:00Z","published":"2023-11-16T11:10:38Z","title":"Breaking Boundaries: Balancing Performance and Robustness in Deep\n Wireless Traffic Forecasting","summary":" Balancing the trade-off between accuracy and robustness is a long-standing\nchallenge in time series forecasting. While most of existing robust algorithms\nhave achieved certain suboptimal performance on clean data, sustaining the same\nperformance level in the presence of data perturbations remains extremely hard.\nIn this paper, we study a wide array of perturbation scenarios and propose\nnovel defense mechanisms against adversarial attacks using real-world telecom\ndata. We compare our strategy against two existing adversarial training\nalgorithms under a range of maximal allowed perturbations, defined using\n$\\ell_{\\infty}$-norm, $\\in [0.1,0.4]$. Our findings reveal that our hybrid\nstrategy, which is composed of a classifier to detect adversarial examples, a\ndenoiser to eliminate noise from the perturbed data samples, and a standard\nforecaster, achieves the best performance on both clean and perturbed data. Our\noptimal model can retain up to $92.02\\%$ the performance of the original\nforecasting model in terms of Mean Squared Error (MSE) on clean data, while\nbeing more robust than the standard adversarially trained models on perturbed\ndata. Its MSE is 2.71$\\times$ and 2.51$\\times$ lower than those of comparing\nmethods on normal and perturbed data, respectively. In addition, the components\nof our models can be trained in parallel, resulting in better computational\nefficiency. Our results indicate that we can optimally balance the trade-off\nbetween the performance and robustness of forecasting models by improving the\nclassifier and denoiser, even in the presence of sophisticated and destructive\npoisoning attacks.\n","authors":["Romain Ilbert","Thai V. Hoang","Zonghua Zhang","Themis Palpanas"],"pdf_url":"https://arxiv.org/pdf/2311.09790v3.pdf","comment":"Accepted for presentation at the ARTMAN workshop, part of the ACM\n Conference on Computer and Communications Security (CCS), 2023"},{"id":"http://arxiv.org/abs/2311.16894v1","updated":"2023-11-28T15:46:12Z","published":"2023-11-28T15:46:12Z","title":"Dendrogram distance: an evaluation metric for generative networks using\n hierarchical clustering","summary":" We present a novel metric for generative modeling evaluation, focusing\nprimarily on generative networks. The method uses dendrograms to represent real\nand fake data, allowing for the divergence between training and generated\nsamples to be computed. This metric focus on mode collapse, targeting\ngenerators that are not able to capture all modes in the training set. 
To\nevaluate the proposed method, we introduce a validation scheme based on\nsampling from real datasets; the metric is therefore evaluated in a controlled\nenvironment and proves to be competitive with other state-of-the-art\napproaches.\n","authors":["Gustavo Sutter Carvalho","Moacir Antonelli Ponti"],"pdf_url":"https://arxiv.org/pdf/2311.16894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.13131v2","updated":"2023-11-28T15:41:46Z","published":"2022-11-23T17:04:20Z","title":"FeTrIL: Feature Translation for Exemplar-Free Class-Incremental Learning","summary":" Exemplar-free class-incremental learning is very challenging due to the\nnegative effect of catastrophic forgetting. A balance between stability and\nplasticity of the incremental process is needed in order to obtain good\naccuracy for past as well as new classes. Existing exemplar-free\nclass-incremental methods focus either on successive fine tuning of the model,\nthus favoring plasticity, or on using a feature extractor fixed after the\ninitial incremental state, thus favoring stability. We introduce a method which\ncombines a fixed feature extractor and a pseudo-features generator to improve\nthe stability-plasticity balance. The generator uses a simple yet effective\ngeometric translation of new class features to create representations of past\nclasses, made of pseudo-features. The translation of features only requires the\nstorage of the centroid representations of past classes to produce their\npseudo-features. Actual features of new classes and pseudo-features of past\nclasses are fed into a linear classifier which is trained incrementally to\ndiscriminate between all classes. The incremental process is much faster with\nthe proposed method compared to mainstream ones which update the entire deep\nmodel. Experiments are performed with three challenging datasets, and different\nincremental settings. A comparison with ten existing methods shows that our\nmethod outperforms the others in most cases.\n","authors":["Grégoire Petit","Adrian Popescu","Hugo Schindler","David Picard","Bertrand Delezoide"],"pdf_url":"https://arxiv.org/pdf/2211.13131v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08415v3","updated":"2023-11-28T15:36:11Z","published":"2023-05-15T07:48:50Z","title":"Marsellus: A Heterogeneous RISC-V AI-IoT End-Node SoC with 2-to-8b DNN\n Acceleration and 30%-Boost Adaptive Body Biasing","summary":" Emerging Artificial Intelligence-enabled Internet-of-Things (AI-IoT)\nSystem-on-a-Chip (SoC) for augmented reality, personalized healthcare, and\nnano-robotics need to run many diverse tasks within a power envelope of a few\ntens of mW over a wide range of operating conditions: compute-intensive but\nstrongly quantized Deep Neural Network (DNN) inference, as well as signal\nprocessing and control requiring high-precision floating-point. 
We present\nMarsellus, an all-digital heterogeneous SoC for AI-IoT end-nodes fabricated in\nGlobalFoundries 22nm FDX that combines 1) a general-purpose cluster of 16\nRISC-V Digital Signal Processing (DSP) cores attuned for the execution of a\ndiverse range of workloads exploiting 4-bit and 2-bit arithmetic extensions\n(XpulpNN), combined with fused MAC&LOAD operations and floating-point support;\n2) a 2-8bit Reconfigurable Binary Engine (RBE) to accelerate 3x3 and 1x1\n(pointwise) convolutions in DNNs; 3) a set of On-Chip Monitoring (OCM) blocks\nconnected to an Adaptive Body Biasing (ABB) generator and a hardware control\nloop, enabling on-the-fly adaptation of transistor threshold voltages.\nMarsellus achieves up to 180 Gop/s or 3.32 Top/s/W on 2-bit precision\narithmetic in software, and up to 637 Gop/s or 12.4 Top/s/W on\nhardware-accelerated DNN layers.\n","authors":["Francesco Conti","Gianna Paulin","Angelo Garofalo","Davide Rossi","Alfio Di Mauro","Georg Rutishauser","Gianmarco Ottavi","Manuel Eggimann","Hayate Okuhara","Luca Benini"],"pdf_url":"https://arxiv.org/pdf/2305.08415v3.pdf","comment":"Post-print accepted by IEEE Journal of Solid-State Circuits. Fixed\n metadata (was missing one co-author), added DOI of IEEE JSSC"},{"id":"http://arxiv.org/abs/2311.16883v1","updated":"2023-11-28T15:31:31Z","published":"2023-11-28T15:31:31Z","title":"Compressing the Backward Pass of Large-Scale Neural Architectures by\n Structured Activation Pruning","summary":" The rise of Deep Neural Networks (DNNs) has led to an increase in model size\nand complexity, straining the memory capacity of GPUs. Sparsity in DNNs,\ncharacterized as structural or ephemeral, has gained attention as a solution.\nThis work focuses on ephemeral sparsity, aiming to reduce memory consumption\nduring training. It emphasizes the significance of activations, an often\noverlooked component, and their role in memory usage. This work employs\nstructured pruning in Block Sparse Compressed Row (BSR) format in combination\nwith a magnitude-based criterion to efficiently prune activations. We\nfurthermore introduce efficient block-sparse operators for GPUs and showcase\ntheir effectiveness, as well as the superior compression offered by block\nsparsity. We report the effectiveness of activation pruning by evaluating\ntraining speed, accuracy, and memory usage of large-scale neural architectures\non the example of ResMLP on image classification tasks. As a result, we observe\na memory reduction of up to 32\\% while maintaining accuracy. Ultimately, our\napproach aims to democratize large-scale model training, reduce GPU\nrequirements, and address ecological concerns.\n","authors":["Daniel Barley","Holger Fröning"],"pdf_url":"https://arxiv.org/pdf/2311.16883v1.pdf","comment":"8 pages, 10 figures, submitted to the 6th AccML workshop at HiPEAC\n conference 2024"},{"id":"http://arxiv.org/abs/2311.16882v1","updated":"2023-11-28T15:31:11Z","published":"2023-11-28T15:31:11Z","title":"Optimisation-Based Multi-Modal Semantic Image Editing","summary":" Image editing affords increased control over the aesthetics and content of\ngenerated images. Pre-existing works focus predominantly on text-based\ninstructions to achieve desired image modifications, which limit edit precision\nand accuracy. In this work, we propose an inference-time editing optimisation,\ndesigned to extend beyond textual edits to accommodate multiple editing\ninstruction types (e.g. spatial layout-based; pose, scribbles, edge maps). 
We\npropose to disentangle the editing task into two competing subtasks: successful\nlocal image modifications and global content consistency preservation, where\nsubtasks are guided through two dedicated loss functions. By allowing to adjust\nthe influence of each loss function, we build a flexible editing solution that\ncan be adjusted to user preferences. We evaluate our method using text, pose\nand scribble edit conditions, and highlight our ability to achieve complex\nedits, through both qualitative and quantitative experiments.\n","authors":["Bowen Li","Yongxin Yang","Steven McDonagh","Shifeng Zhang","Petru-Daniel Tudosiu","Sarah Parisot"],"pdf_url":"https://arxiv.org/pdf/2311.16882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16877v1","updated":"2023-11-28T15:26:09Z","published":"2023-11-28T15:26:09Z","title":"Imputation using training labels and classification via label imputation","summary":" Missing data is a common problem in practical settings. Various imputation\nmethods have been developed to deal with missing data. However, even though the\nlabel is usually available in the training data, the common practice of\nimputation usually only relies on the input and ignores the label. In this\nwork, we illustrate how stacking the label into the input can significantly\nimprove the imputation of the input. In addition, we propose a classification\nstrategy that initializes the predicted test label with missing values and\nstacks the label with the input for imputation. This allows imputing the label\nand the input at the same time. Also, the technique is capable of handling data\ntraining with missing labels without any prior imputation and is applicable to\ncontinuous, categorical, or mixed-type data. Experiments show promising results\nin terms of accuracy.\n","authors":["Thu Nguyen","Pål Halvorsen","Michael A. Riegler"],"pdf_url":"https://arxiv.org/pdf/2311.16877v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16876v1","updated":"2023-11-28T15:25:14Z","published":"2023-11-28T15:25:14Z","title":"Digital Twin-Enhanced Deep Reinforcement Learning for Resource\n Management in Networks Slicing","summary":" Network slicing-based communication systems can dynamically and efficiently\nallocate resources for diversified services. However, due to the limitation of\nthe network interface on channel access and the complexity of the resource\nallocation, it is challenging to achieve an acceptable solution in the\npractical system without precise prior knowledge of the dynamics probability\nmodel of the service requests. Existing work attempts to solve this problem\nusing deep reinforcement learning (DRL), however, such methods usually require\na lot of interaction with the real environment in order to achieve good\nresults. In this paper, a framework consisting of a digital twin and\nreinforcement learning agents is present to handle the issue. Specifically, we\npropose to use the historical data and the neural networks to build a digital\ntwin model to simulate the state variation law of the real environment. Then,\nwe use the data generated by the network slicing environment to calibrate the\ndigital twin so that it is in sync with the real environment. Finally, DRL for\nslice optimization optimizes its own performance in this virtual\npre-verification environment. We conducted an exhaustive verification of the\nproposed digital twin framework to confirm its scalability. Specifically, we\npropose to use loss landscapes to visualize the generalization of DRL\nsolutions. 
We explore a distillation-based optimization scheme for lightweight\nslicing strategies. In addition, we also extend the framework to offline\nreinforcement learning, where solutions can be used to obtain intelligent\ndecisions based solely on historical data. Numerical simulation experiments\nshow that the proposed digital twin can significantly improve the performance\nof the slice optimization strategy.\n","authors":["Zhengming Zhang","Yongming Huang","Cheng Zhang","Qingbi Zheng","Luxi Yang","Xiaohu You"],"pdf_url":"https://arxiv.org/pdf/2311.16876v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16872v1","updated":"2023-11-28T15:24:02Z","published":"2023-11-28T15:24:02Z","title":"A unified weighting framework for evaluating nearest neighbour\n classification","summary":" We present the first comprehensive and large-scale evaluation of classical\n(NN), fuzzy (FNN) and fuzzy rough (FRNN) nearest neighbour classification. We\nshow that existing proposals for nearest neighbour weighting can be\nstandardised in the form of kernel functions, applied to the distance values\nand/or ranks of the nearest neighbours of a test instance. Furthermore, we\nidentify three commonly used distance functions and four scaling measures. We\nsystematically evaluate these choices on a collection of 85 real-life\nclassification datasets. We find that NN, FNN and FRNN all perform best with\nBoscovich distance. NN and FRNN perform best with a combination of Samworth\nrank- and distance weights and scaling by the mean absolute deviation around\nthe median ($r_1$), the standard deviaton ($r_2$) or the interquartile range\n($r_{\\infty}^*$), while FNN performs best with only Samworth distance-weights\nand $r_1$- or $r_2$-scaling. We also introduce a new kernel based on fuzzy\nYager negation, and show that NN achieves comparable performance with Yager\ndistance-weights, which are simpler to implement than a combination of Samworth\ndistance- and rank-weights. Finally, we demonstrate that FRNN generally\noutperforms NN, which in turns performs systematically better than FNN.\n","authors":["Oliver Urs Lenz","Henri Bollaert","Chris Cornelis"],"pdf_url":"https://arxiv.org/pdf/2311.16872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15951v2","updated":"2023-11-28T15:18:43Z","published":"2023-11-27T15:57:11Z","title":"Replay across Experiments: A Natural Extension of Off-Policy RL","summary":" Replaying data is a principal mechanism underlying the stability and data\nefficiency of off-policy reinforcement learning (RL). We present an effective\nyet simple framework to extend the use of replays across multiple experiments,\nminimally adapting the RL workflow for sizeable improvements in controller\nperformance and research iteration times. At its core, Replay Across\nExperiments (RaE) involves reusing experience from previous experiments to\nimprove exploration and bootstrap learning while reducing required changes to a\nminimum in comparison to prior work. We empirically show benefits across a\nnumber of RL algorithms and challenging control domains spanning both\nlocomotion and manipulation, including hard exploration tasks from egocentric\nvision. 
Through comprehensive ablations, we demonstrate robustness to the\nquality and amount of data available and various hyperparameter choices.\nFinally, we discuss how our approach can be applied more broadly across\nresearch life cycles and can increase resilience by reloading data across\nrandom seeds or hyperparameter variations.\n","authors":["Dhruva Tirumala","Thomas Lampe","Jose Enrique Chen","Tuomas Haarnoja","Sandy Huang","Guy Lever","Ben Moran","Tim Hertweck","Leonard Hasenclever","Martin Riedmiller","Nicolas Heess","Markus Wulfmeier"],"pdf_url":"https://arxiv.org/pdf/2311.15951v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16863v1","updated":"2023-11-28T15:09:36Z","published":"2023-11-28T15:09:36Z","title":"Power Hungry Processing: Watts Driving the Cost of AI Deployment?","summary":" Recent years have seen a surge in the popularity of commercial AI products\nbased on generative, multi-purpose AI systems promising a unified approach to\nbuilding machine learning (ML) models into technology. However, this ambition\nof \"generality\" comes at a steep cost to the environment, given the amount of\nenergy these systems require and the amount of carbon that they emit. In this\nwork, we propose the first systematic comparison of the ongoing inference cost\nof various categories of ML systems, covering both task-specific (i.e.\nfinetuned models that carry out a single task) and `general-purpose' models,\n(i.e. those trained for multiple tasks). We measure deployment cost as the\namount of energy and carbon required to perform 1,000 inferences on\nrepresentative benchmark dataset using these models. We find that\nmulti-purpose, generative architectures are orders of magnitude more expensive\nthan task-specific systems for a variety of tasks, even when controlling for\nthe number of model parameters. We conclude with a discussion around the\ncurrent trend of deploying multi-purpose generative ML systems, and caution\nthat their utility should be more intentionally weighed against increased costs\nin terms of energy and emissions. All the data from our study can be accessed\nvia an interactive demo to carry out further exploration and analysis.\n","authors":["Alexandra Sasha Luccioni","Yacine Jernite","Emma Strubell"],"pdf_url":"https://arxiv.org/pdf/2311.16863v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16860v1","updated":"2023-11-28T15:07:25Z","published":"2023-11-28T15:07:25Z","title":"Data-efficient operator learning for solving high Mach number fluid flow\n problems","summary":" We consider the problem of using SciML to predict solutions of high Mach\nfluid flows over irregular geometries. In this setting, data is limited, and so\nit is desirable for models to perform well in the low-data setting. We show\nthat Neural Basis Functions (NBF), which learns a basis of behavior modes from\nthe data and then uses this basis to make predictions, is more effective than a\nbasis-unaware baseline model. In addition, we identify continuing challenges in\nthe space of predicting solutions for this type of problem.\n","authors":["Noah Ford","Victor J. 
Leon","Honest Merman","Jeffrey Gilbert","Alexander New"],"pdf_url":"https://arxiv.org/pdf/2311.16860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16856v1","updated":"2023-11-28T15:05:13Z","published":"2023-11-28T15:05:13Z","title":"Attentional Graph Neural Networks for Robust Massive Network\n Localization","summary":" Graph neural networks (GNNs) have gained significant popularity for\nclassification tasks in machine learning, yet their applications to regression\nproblems remain limited. Concurrently, attention mechanisms have emerged as\npowerful tools in sequential learning tasks. In this paper, we employ GNNs and\nattention mechanisms to address a classical but challenging nonlinear\nregression problem: network localization. We propose a novel GNN-based network\nlocalization method that achieves exceptional stability and accuracy in the\npresence of severe non-line-of-sight (NLOS) propagations, while eliminating the\nneed for laborious offline calibration or NLOS identification. Extensive\nexperimental results validate the effectiveness and high accuracy of our\nGNN-based localization model, particularly in challenging NLOS scenarios.\nHowever, the proposed GNN-based model exhibits limited flexibility, and its\naccuracy is highly sensitive to a specific hyperparameter that determines the\ngraph structure. To address the limitations and extend the applicability of the\nGNN-based model to real scenarios, we introduce two attentional graph neural\nnetworks (AGNNs) that offer enhanced flexibility and the ability to\nautomatically learn the optimal hyperparameter for each node. Experimental\nresults confirm that the AGNN models are able to enhance localization accuracy,\nproviding a promising solution for real-world applications. We also provide\nsome analyses of the improved performance achieved by the AGNN models from the\nperspectives of dynamic attention and signal denoising characteristics.\n","authors":["Wenzhong Yan","Juntao Wang","Feng Yin","Abdelhak M. Zoubir"],"pdf_url":"https://arxiv.org/pdf/2311.16856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16849v1","updated":"2023-11-28T15:00:11Z","published":"2023-11-28T15:00:11Z","title":"Identifiable Feature Learning for Spatial Data with Nonlinear ICA","summary":" Recently, nonlinear ICA has surfaced as a popular alternative to the many\nheuristic models used in deep representation learning and disentanglement. An\nadvantage of nonlinear ICA is that a sophisticated identifiability theory has\nbeen developed; in particular, it has been proven that the original components\ncan be recovered under sufficiently strong latent dependencies. Despite this\ngeneral theory, practical nonlinear ICA algorithms have so far been mainly\nlimited to data with one-dimensional latent dependencies, especially\ntime-series data. In this paper, we introduce a new nonlinear ICA framework\nthat employs $t$-process (TP) latent components which apply naturally to data\nwith higher-dimensional dependency structures, such as spatial and\nspatio-temporal data. In particular, we develop a new learning and inference\nalgorithm that extends variational inference methods to handle the combination\nof a deep neural network mixing function with the TP prior, and employs the\nmethod of inducing points for computational efficacy. On the theoretical side,\nwe show that such TP independent components are identifiable under very general\nconditions. 
Further, Gaussian Process (GP) nonlinear ICA is established as a\nlimit of the TP Nonlinear ICA model, and we prove that the identifiability of\nthe latent components at this GP limit is more restricted. Namely, those\ncomponents are identifiable if and only if they have distinctly different\ncovariance kernels. Our algorithm and identifiability theorems are explored on\nsimulated spatial data and real world spatio-temporal data.\n","authors":["Hermanni Hälvä","Jonathan So","Richard E. Turner","Aapo Hyvärinen"],"pdf_url":"https://arxiv.org/pdf/2311.16849v1.pdf","comment":"Work under review"},{"id":"http://arxiv.org/abs/2307.11957v4","updated":"2023-11-28T14:53:32Z","published":"2023-07-22T01:56:58Z","title":"High-performance real-world optical computing trained by in situ\n model-free optimization","summary":" Optical computing systems provide high-speed and low-energy data processing\nbut face deficiencies in computationally demanding training and\nsimulation-to-reality gaps. We propose a model-free optimization (MFO) method\nbased on a score gradient estimation algorithm for computationally efficient in\nsitu training of optical computing systems. This approach treats an optical\ncomputing system as a black box and back-propagates the loss directly to the\noptical computing weights' probability distributions, circumventing the need\nfor a computationally heavy and biased system simulation. Our experiments on a\nsingle-layer diffractive optical computing system show that MFO outperforms\nhybrid training on the MNIST and FMNIST datasets. Furthermore, we demonstrate\nimage-free and high-speed classification of cells from their phase maps. Our\nmethod's model-free and high-performance nature, combined with its low demand\nfor computational resources, expedites the transition of optical computing from\nlaboratory demonstrations to real-world applications.\n","authors":["Guangyuan Zhao","Xin Shu"],"pdf_url":"https://arxiv.org/pdf/2307.11957v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16834v1","updated":"2023-11-28T14:51:06Z","published":"2023-11-28T14:51:06Z","title":"Modular Neural Networks for Time Series Forecasting: Interpretability\n and Feature Selection using Attention","summary":" Multivariate time series have many applications, from healthcare and\nmeteorology to life science. Although deep learning models have shown excellent\npredictive performance for time series, they have been criticised for being\n\"black-boxes\" or non-interpretable. This paper proposes a novel modular neural\nnetwork model for multivariate time series prediction that is interpretable by\nconstruction. A recurrent neural network learns the temporal dependencies in\nthe data while an attention-based feature selection component selects the most\nrelevant features and suppresses redundant features used in the learning of the\ntemporal dependencies. A modular deep network is trained from the selected\nfeatures independently to show the users how features influence outcomes,\nmaking the model interpretable. 
Experimental results show that this approach\ncan outperform state-of-the-art interpretable Neural Additive Models (NAM) and\nvariations thereof in both regression and classification of time series tasks,\nachieving a predictive performance that is comparable to the top\nnon-interpretable methods for time series, LSTM and XGBoost.\n","authors":["Qiqi Su","Christos Kloukinas","Artur d'Garcez"],"pdf_url":"https://arxiv.org/pdf/2311.16834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16833v1","updated":"2023-11-28T14:50:50Z","published":"2023-11-28T14:50:50Z","title":"1-Lipschitz Layers Compared: Memory, Speed, and Certifiable Robustness","summary":" The robustness of neural networks against input perturbations with bounded\nmagnitude represents a serious concern in the deployment of deep learning\nmodels in safety-critical systems. Recently, the scientific community has\nfocused on enhancing certifiable robustness guarantees by crafting 1-Lipschitz\nneural networks that leverage Lipschitz bounded dense and convolutional layers.\nAlthough different methods have been proposed in the literature to achieve this\ngoal, understanding the performance of such methods is not straightforward,\nsince different metrics can be relevant (e.g., training time, memory usage,\naccuracy, certifiable robustness) for different applications. For this reason,\nthis work provides a thorough theoretical and empirical comparison between\nmethods by evaluating them in terms of memory usage, speed, and certifiable\nrobust accuracy. The paper also provides some guidelines and recommendations to\nsupport the user in selecting the methods that work best depending on the\navailable resources. We provide code at\nhttps://github.com/berndprach/1LipschitzLayersCompared.\n","authors":["Bernd Prach","Fabio Brau","Giorgio Buttazzo","Christoph H. Lampert"],"pdf_url":"https://arxiv.org/pdf/2311.16833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16829v1","updated":"2023-11-28T14:48:22Z","published":"2023-11-28T14:48:22Z","title":"Decomposer: Semi-supervised Learning of Image Restoration and Image\n Decomposition","summary":" We present Decomposer, a semi-supervised reconstruction model that decomposes\ndistorted image sequences into their fundamental building blocks - the original\nimage and the applied augmentations, i.e., shadow, light, and occlusions. To\nsolve this problem, we use the SIDAR dataset that provides a large number of\ndistorted image sequences: each sequence contains images with shadows,\nlighting, and occlusions applied to an undistorted version. Each distortion\nchanges the original signal in different ways, e.g., additive or multiplicative\nnoise. We propose a transformer-based model to explicitly learn this\ndecomposition. The sequential model uses 3D Swin-Transformers for\nspatio-temporal encoding and 3D U-Nets as prediction heads for individual parts\nof the decomposition. 
We demonstrate that by separately pre-training our model\non weakly supervised pseudo labels, we can steer our model to optimize for our\nambiguous problem definition and learn to differentiate between the different\nimage distortions.\n","authors":["Boris Meinardus","Mariusz Trzeciakiewicz","Tim Herzig","Monika Kwiatkowski","Simon Matern","Olaf Hellwich"],"pdf_url":"https://arxiv.org/pdf/2311.16829v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16822v1","updated":"2023-11-28T14:36:43Z","published":"2023-11-28T14:36:43Z","title":"Large Language Models Suffer From Their Own Output: An Analysis of the\n Self-Consuming Training Loop","summary":" Large language models (LLM) have become state of the art in many benchmarks\nand conversational LLM applications like ChatGPT are now widely used by the\npublic. Those LLMs can be used to generate large amounts of content which is\nposted on the internet to various platforms. As LLMs are trained on datasets\nusually collected from the internet, this LLM-generated content might be used\nto train the next generation of LLMs. Therefore, a self-consuming training loop\nemerges in which new LLM generations are trained on the output from the\nprevious generations. We empirically study this self-consuming training loop\nusing a novel dataset to analytically and accurately measure quality and\ndiversity of generated outputs. We find that this self-consuming training loop\ninitially improves both quality and diversity. However, after a few generations\nthe output inevitably degenerates in diversity. We find that the rate of\ndegeneration depends on the proportion of real and generated data.\n","authors":["Martin Briesch","Dominik Sobania","Franz Rothlauf"],"pdf_url":"https://arxiv.org/pdf/2311.16822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10364v4","updated":"2023-11-28T14:20:28Z","published":"2023-08-20T20:49:15Z","title":"SE(3) Equivariant Augmented Coupling Flows","summary":" Coupling normalizing flows allow for fast sampling and density evaluation,\nmaking them the tool of choice for probabilistic modeling of physical systems.\nHowever, the standard coupling architecture precludes endowing flows that\noperate on the Cartesian coordinates of atoms with the SE(3) and permutation\ninvariances of physical systems. This work proposes a coupling flow that\npreserves SE(3) and permutation equivariance by performing coordinate splits\nalong additional augmented dimensions. At each layer, the flow maps atoms'\npositions into learned SE(3) invariant bases, where we apply standard flow\ntransformations, such as monotonic rational-quadratic splines, before returning\nto the original basis. Crucially, our flow preserves fast sampling and density\nevaluation, and may be used to produce unbiased estimates of expectations with\nrespect to the target distribution via importance sampling. When trained on the\nDW4, LJ13, and QM9-positional datasets, our flow is competitive with\nequivariant continuous normalizing flows, while allowing sampling more than an\norder of magnitude faster. Moreover, to the best of our knowledge, we are the\nfirst to learn the full Boltzmann distribution of alanine dipeptide by only\nmodeling the Cartesian positions of its atoms. Lastly, we demonstrate that our\nflow can be trained to approximately sample from the Boltzmann distribution of\nthe DW4 and LJ13 particle systems using only their energy functions.\n","authors":["Laurence I. 
Midgley","Vincent Stimper","Javier Antorán","Emile Mathieu","Bernhard Schölkopf","José Miguel Hernández-Lobato"],"pdf_url":"https://arxiv.org/pdf/2308.10364v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12089v2","updated":"2023-11-28T14:15:33Z","published":"2023-11-20T16:09:06Z","title":"Explaining Deep Learning Models for Age-related Gait Classification\n based on time series acceleration","summary":" Gait analysis holds significant importance in monitoring daily health,\nparticularly among older adults. Advancements in sensor technology enable the\ncapture of movement in real-life environments and generate big data. Machine\nlearning, notably deep learning (DL), shows promise to use these big data in\ngait analysis. However, the inherent black-box nature of these models poses\nchallenges for their clinical application. This study aims to enhance\ntransparency in DL-based gait classification for aged-related gait patterns\nusing Explainable Artificial Intelligence, such as SHAP.\n A total of 244 subjects, comprising 129 adults and 115 older adults (age>65),\nwere included. They performed a 3-minute walking task while accelerometers were\naffixed to the lumbar segment L3. DL models, convolutional neural network (CNN)\nand gated recurrent unit (GRU), were trained using 1-stride and 8-stride\naccelerations, respectively, to classify adult and older adult groups. SHAP was\nemployed to explain the models' predictions.\n CNN achieved a satisfactory performance with an accuracy of 81.4% and an AUC\nof 0.89, and GRU demonstrated promising results with an accuracy of 84.5% and\nan AUC of 0.94. SHAP analysis revealed that both CNN and GRU assigned higher\nSHAP values to the data from vertical and walking directions, particularly\nemphasizing data around heel contact, spanning from the terminal swing to\nloading response phases. Furthermore, SHAP values indicated that GRU did not\ntreat every stride equally.\n CNN accurately distinguished between adults and older adults based on the\ncharacteristics of a single stride's data. GRU achieved accurate classification\nby considering the relationships and subtle differences between strides. In\nboth models, data around heel contact emerged as most critical, suggesting\ndifferences in acceleration and deceleration patterns during walking between\ndifferent age groups.\n","authors":["Xiaoping Zheng","Bert Otten","Michiel F Reneman","Claudine JC Lamoth"],"pdf_url":"https://arxiv.org/pdf/2311.12089v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01837v2","updated":"2023-11-28T13:36:58Z","published":"2023-10-03T07:01:23Z","title":"Extending CAM-based XAI methods for Remote Sensing Imagery Segmentation","summary":" Current AI-based methods do not provide comprehensible physical\ninterpretations of the utilized data, extracted features, and\npredictions/inference operations. As a result, deep learning models trained\nusing high-resolution satellite imagery lack transparency and explainability\nand can be merely seen as a black box, which limits their wide-level adoption.\nExperts need help understanding the complex behavior of AI models and the\nunderlying decision-making process. The explainable artificial intelligence\n(XAI) field is an emerging field providing means for robust, practical, and\ntrustworthy deployment of AI models. Several XAI techniques have been proposed\nfor image classification tasks, whereas the interpretation of image\nsegmentation remains largely unexplored. 
This paper aims to bridge this gap\nby adapting the recent XAI classification algorithms and making them usable for\nmulti-class image segmentation, where we mainly focus on buildings' segmentation\nfrom high-resolution satellite images. To benchmark and compare the performance\nof the proposed approaches, we introduce a new XAI evaluation methodology and\nmetric based on \"Entropy\" to measure the model uncertainty. Conventional XAI\nevaluation methods rely mainly on feeding area-of-interest regions from the\nimage back to the pre-trained (utility) model and then calculating the average\nchange in the probability of the target class. Those evaluation metrics lack\nthe needed robustness, and we show that using Entropy to monitor the model\nuncertainty in segmenting the pixels within the target class is more suitable.\nWe hope this work will pave the way for additional XAI research for image\nsegmentation and applications in the remote sensing discipline.\n","authors":["Abdul Karim Gizzini","Mustafa Shukor","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01837v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18228v2","updated":"2023-11-28T13:34:58Z","published":"2023-05-26T16:35:20Z","title":"SR-OOD: Out-of-Distribution Detection via Sample Repairing","summary":" Out-of-distribution (OOD) detection is a crucial task for ensuring the\nreliability and robustness of machine learning models. Recent works have shown\nthat generative models often assign high confidence scores to OOD samples,\nindicating that they fail to capture the semantic information of the data. To\ntackle this problem, we take advantage of sample repairing and propose a novel\nOOD detection framework, namely SR-OOD. Our framework leverages the idea that\nrepairing an OOD sample can reveal its semantic inconsistency with the\nin-distribution data. Specifically, our framework consists of two components: a\nsample repairing module and a detection module. The sample repairing module\napplies erosion to an input sample and uses a generative adversarial network to\nrepair it. The detection module then determines whether the input sample is OOD\nusing a distance metric. Our framework does not require any additional data or\nlabel information for detection, making it applicable to various scenarios. We\nconduct extensive experiments on three image datasets: CIFAR-10, CelebA, and\nPokemon. The results demonstrate that our approach achieves superior\nperformance over the state-of-the-art generative methods in OOD detection.\n","authors":["Rui Sun","Andi Zhang","Haiming Zhang","Jinke Ren","Yao Zhu","Ruimao Zhang","Shuguang Cui","Zhen Li"],"pdf_url":"https://arxiv.org/pdf/2305.18228v2.pdf","comment":"This is an updated version of the paper"},{"id":"http://arxiv.org/abs/2311.16771v1","updated":"2023-11-28T13:25:34Z","published":"2023-11-28T13:25:34Z","title":"The HR-Calculus: Enabling Information Processing with Quaternion Algebra","summary":" From their inception, quaternions and their division algebra have proven to\nbe advantageous in modelling rotation/orientation in three-dimensional spaces\nand have seen use from the initial formulation of electromagnetic field theory\nthrough to forming the basis of quantum field theory. Despite their impressive\nversatility in modelling real-world phenomena, adaptive information processing\ntechniques specifically designed for quaternion-valued signals have only\nrecently come to the attention of the machine learning, signal processing, and\ncontrol communities. 
The most important development in this direction is the\nintroduction of the HR-calculus, which provides the required mathematical\nfoundation for deriving adaptive information processing techniques directly in\nthe quaternion domain. In this article, the foundations of the HR-calculus are\nrevised and the required tools for deriving adaptive learning techniques\nsuitable for dealing with quaternion-valued signals, such as the gradient\noperator, chain and product derivative rules, and Taylor series expansion, are\npresented. This serves to establish the most important applications of adaptive\ninformation processing in the quaternion domain for both single-node and\nmulti-node formulations. The article is supported by Supplementary Material,\nwhich will be referred to as SM.\n","authors":["Danilo P. Mandic","Sayed Pouria Talebi","Clive Cheong Took","Yili Xia","Dongpo Xu","Min Xiang","Pauline Bourigault"],"pdf_url":"https://arxiv.org/pdf/2311.16771v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13029v2","updated":"2023-11-28T13:23:39Z","published":"2023-06-22T16:46:00Z","title":"Decentralized Online Federated G-Network Learning for Lightweight\n Intrusion Detection","summary":" Cyberattacks are increasingly threatening networked systems, often with the\nemergence of new types of unknown (zero-day) attacks and the rise of vulnerable\ndevices. Such attacks can also target multiple components of a Supply Chain,\nwhich can be protected via Machine Learning (ML)-based Intrusion Detection\nSystems (IDSs). However, the need to learn from large amounts of labelled data often\nlimits the applicability of ML-based IDSs to cybersystems that only have access\nto private local data, while distributed systems such as Supply Chains have\nmultiple components, each of which must preserve its private data while being\ntargeted by the same attack. To address this issue, this paper proposes a novel\nDecentralized and Online Federated Learning Intrusion Detection (DOF-ID)\narchitecture based on the G-Network model with collaborative learning that\nallows each IDS used by a specific component to learn from the experience\ngained in other components, in addition to its own local data, without\nviolating the data privacy of other components. The performance evaluation\nresults using public Kitsune and Bot-IoT datasets show that DOF-ID\nsignificantly improves the intrusion detection performance in all of the\ncollaborating components, with acceptable computation time for online learning.\n","authors":["Mert Nakıp","Baran Can Gül","Erol Gelenbe"],"pdf_url":"https://arxiv.org/pdf/2306.13029v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16769v1","updated":"2023-11-28T13:19:54Z","published":"2023-11-28T13:19:54Z","title":"Equilibrium in the Computing Continuum through Active Inference","summary":" Computing Continuum (CC) systems are challenged to ensure the intricate\nrequirements of each computational tier. Given the system's scale, the Service\nLevel Objectives (SLOs), which express these requirements, must be\nbroken down into smaller parts that can be decentralized. We present our\nframework for collaborative edge intelligence enabling individual edge devices\nto (1) develop a causal understanding of how to enforce their SLOs, and (2)\ntransfer knowledge to speed up the onboarding of heterogeneous devices. Through\ncollaboration, they (3) increase the scope of SLO fulfillment. 
We implemented\nthe framework and evaluated a use case in which a CC system is responsible for\nensuring Quality of Service (QoS) and Quality of Experience (QoE) during video\nstreaming. Our results showed that edge devices required only ten training\nrounds to ensure four SLOs; furthermore, the underlying causal structures were\nalso rationally explainable. The addition of new types of devices can be done a\nposteriori; the framework allowed them to reuse existing models, even though\nthe device type had been unknown. Finally, rebalancing the load within a device\ncluster allowed individual edge devices to recover their SLO compliance from 22% to 89% after a\nnetwork failure.\n","authors":["Boris Sedlak","Victor Casamayor Pujol","Praveen Kumar Donta","Schahram Dustdar"],"pdf_url":"https://arxiv.org/pdf/2311.16769v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16766v1","updated":"2023-11-28T13:14:55Z","published":"2023-11-28T13:14:55Z","title":"Rescuing referral failures during automated diagnosis of domain-shifted\n medical images","summary":" The success of deep learning models deployed in the real world depends\ncritically on their ability to generalize well across diverse data domains.\nHere, we address a fundamental challenge with selective classification during\nautomated diagnosis with domain-shifted medical images. In this scenario,\nmodels must learn to avoid making predictions when label confidence is low,\nespecially when tested with samples far removed from the training set\n(covariate shift). Such uncertain cases are typically referred to the clinician\nfor further analysis and evaluation. Yet, we show that even state-of-the-art\ndomain generalization approaches fail severely during referral when tested on\nmedical images acquired from a different demographic or using a different\ntechnology. We examine two benchmark diagnostic medical imaging datasets\nexhibiting strong covariate shifts: i) diabetic retinopathy prediction with\nretinal fundus images and ii) multilabel disease prediction with chest X-ray\nimages. We show that predictive uncertainty estimates do not generalize well\nunder covariate shifts, leading to non-monotonic referral curves and severe\ndrops in performance (up to 50%) at high referral rates (>70%). We evaluate\nnovel combinations of robust generalization and post hoc referral approaches\nthat rescue these failures and achieve significant performance improvements,\ntypically >10%, over baseline methods. Our study identifies a critical\nchallenge with referral in domain-shifted medical images and finds key\napplications in reliable, automated disease diagnosis.\n","authors":["Anuj Srivastava","Karm Patel","Pradeep Shenoy","Devarajan Sridharan"],"pdf_url":"https://arxiv.org/pdf/2311.16766v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15243v2","updated":"2023-11-28T13:06:43Z","published":"2023-11-26T09:06:40Z","title":"ID-like Prompt Learning for Few-Shot Out-of-Distribution Detection","summary":" Out-of-distribution (OOD) detection methods often exploit auxiliary outliers\nto train a model for identifying OOD samples, especially discovering challenging\noutliers from an auxiliary outlier dataset to improve OOD detection. However,\nthey may still face limitations in effectively distinguishing between the most\nchallenging OOD samples that are much like in-distribution (ID) data, i.e.,\nID-like samples. 
To this end, we propose a novel OOD detection framework that\ndiscovers ID-like outliers using CLIP from the vicinity space of the ID\nsamples, thus helping to identify these most challenging OOD samples. Then a\nprompt learning framework is proposed that utilizes the identified ID-like\noutliers to further leverage the capabilities of CLIP for OOD detection.\nBenefiting from the powerful CLIP, we only need a small number of ID samples to\nlearn the prompts of the model without exposing other auxiliary outlier\ndatasets. By focusing on the most challenging ID-like OOD samples and elegantly\nexploiting the capabilities of CLIP, our method achieves superior few-shot\nlearning performance on various real-world image datasets (e.g., in 4-shot OOD\ndetection on the ImageNet-1k dataset, our method reduces the average FPR95 by\n12.16% and improves the average AUROC by 2.76%, compared to state-of-the-art\nmethods).\n","authors":["Yichen Bai","Zongbo Han","Changqing Zhang","Bing Cao","Xiaoheng Jiang","Qinghua Hu"],"pdf_url":"https://arxiv.org/pdf/2311.15243v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2311.16741v1","updated":"2023-11-28T12:39:34Z","published":"2023-11-28T12:39:34Z","title":"Asynchronous Wireless Federated Learning with Probabilistic Client\n Selection","summary":" Federated learning (FL) is a promising distributed learning framework where\ndistributed clients collaboratively train a machine learning model coordinated\nby a server. To tackle the stragglers issue in asynchronous FL, we consider\nthat each client keeps local updates and probabilistically transmits the local\nmodel to the server at arbitrary times. We first derive the (approximate)\nexpression for the convergence rate based on the probabilistic client\nselection. Then, an optimization problem is formulated to trade off the\nconvergence rate of asynchronous FL and mobile energy consumption by joint\nprobabilistic client selection and bandwidth allocation. We develop an\niterative algorithm to solve the non-convex problem globally optimally.\nExperiments demonstrate the superiority of the proposed approach compared with\nthe traditional schemes.\n","authors":["Jiarong Yang","Yuan Liu","Fangjiong Chen","Wen Chen","Changle Li"],"pdf_url":"https://arxiv.org/pdf/2311.16741v1.pdf","comment":"To appear in IEEE Transactions on Wireless Communications"},{"id":"http://arxiv.org/abs/2311.12399v2","updated":"2023-11-28T12:32:05Z","published":"2023-11-21T07:22:48Z","title":"A Survey of Graph Meets Large Language Model: Progress and Future\n Directions","summary":" Graph plays a significant role in representing and analyzing complex\nrelationships in real-world applications such as citation networks, social\nnetworks, and biological data. Recently, Large Language Models (LLMs), which\nhave achieved tremendous success in various domains, have also been leveraged\nin graph-related tasks to surpass traditional Graph Neural Networks (GNNs)\nbased methods and yield state-of-the-art performance. In this survey, we first\npresent a comprehensive review and analysis of existing methods that integrate\nLLMs with graphs. First of all, we propose a new taxonomy, which organizes\nexisting methods into three categories based on the role (i.e., enhancer,\npredictor, and alignment component) played by LLMs in graph-related tasks. Then\nwe systematically survey the representative methods along the three categories\nof the taxonomy. 
Finally, we discuss the remaining limitations of existing\nstudies and highlight promising avenues for future research. The relevant\npapers are summarized and will be consistently updated at:\nhttps://github.com/yhLeeee/Awesome-LLMs-in-Graph-tasks.\n","authors":["Yuhan Li","Zhixun Li","Peisong Wang","Jia Li","Xiangguo Sun","Hong Cheng","Jeffrey Xu Yu"],"pdf_url":"https://arxiv.org/pdf/2311.12399v2.pdf","comment":"Work in progress; 13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.16727v1","updated":"2023-11-28T12:16:06Z","published":"2023-11-28T12:16:06Z","title":"Sluggish and Chemically-Biased Interstitial Diffusion in Concentrated\n Solid Solution Alloys: Mechanisms and Methods","summary":" Interstitial diffusion is a pivotal process that governs the phase stability\nand irradiation response of materials in non-equilibrium conditions. In this\nwork, we study sluggish and chemically-biased interstitial diffusion in Fe-Ni\nconcentrated solid solution alloys (CSAs) by combining machine learning (ML)\nand kinetic Monte Carlo (kMC), where ML is used to accurately and efficiently\npredict the migration energy barriers on-the-fly. The ML-kMC reproduces the\ndiffusivity that was reported by molecular dynamics results at high\ntemperatures. With this powerful tool, we find that the observed sluggish\ndiffusion and the \"Ni-Ni-Ni\"-biased diffusion in Fe-Ni alloys are ascribed to a\nunique \"Barrier Lock\" mechanism, whereas the \"Fe-Fe-Fe\"-biased diffusion is\ninfluenced by a \"Component Dominance\" mechanism. Inspired by the mentioned\nmechanisms, a practical AvgS-kMC method is proposed for conveniently and\nswiftly determining interstitial-mediated diffusivity by only relying on the\nmean energy barriers of migration patterns. Combining the AvgS-kMC with the\ndifferential evolutionary algorithm, an inverse design strategy for optimizing\nsluggish diffusion properties is applied to emphasize the crucial role of\nfavorable migration patterns.\n","authors":["Biao Xu","Haijun Fu","Shasha Huang","Shihua Ma","Yaoxu Xiong","Jun Zhang","Xuepeng Xiang","Wenyu Lu","Ji-Jung Kai","Shijun Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.16727v1.pdf","comment":"30 pages,9 figures"},{"id":"http://arxiv.org/abs/2311.16711v1","updated":"2023-11-28T11:45:35Z","published":"2023-11-28T11:45:35Z","title":"LEDITS++: Limitless Image Editing using Text-to-Image Models","summary":" Text-to-image diffusion models have recently received increasing interest for\ntheir astonishing ability to produce high-fidelity images from solely text\ninputs. Subsequent research efforts aim to exploit and apply their capabilities\nto real image editing. However, existing image-to-image methods are often\ninefficient, imprecise, and of limited versatility. They either require\ntime-consuming fine-tuning, deviate unnecessarily strongly from the input\nimage, and/or lack support for multiple, simultaneous edits. To address these\nissues, we introduce LEDITS++, an efficient yet versatile and precise textual\nimage manipulation technique. LEDITS++'s novel inversion approach requires no\ntuning nor optimization and produces high-fidelity results with a few diffusion\nsteps. Second, our methodology supports multiple simultaneous edits and is\narchitecture-agnostic. Third, we use a novel implicit masking technique that\nlimits changes to relevant image regions. We propose the novel TEdBench++\nbenchmark as part of our exhaustive evaluation. Our results demonstrate the\ncapabilities of LEDITS++ and its improvements over previous methods. 
The\nproject page is available at https://leditsplusplus-project.static.hf.space .\n","authors":["Manuel Brack","Felix Friedrich","Katharina Kornmeier","Linoy Tsaban","Patrick Schramowski","Kristian Kersting","Apolinário Passos"],"pdf_url":"https://arxiv.org/pdf/2311.16711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16706v1","updated":"2023-11-28T11:29:12Z","published":"2023-11-28T11:29:12Z","title":"Sinkhorn Flow: A Continuous-Time Framework for Understanding and\n Generalizing the Sinkhorn Algorithm","summary":" Many problems in machine learning can be formulated as solving\nentropy-regularized optimal transport on the space of probability measures. The\ncanonical approach involves the Sinkhorn iterates, renowned for their rich\nmathematical properties. Recently, the Sinkhorn algorithm has been recast\nwithin the mirror descent framework, thus benefiting from classical\noptimization theory insights. Here, we build upon this result by introducing a\ncontinuous-time analogue of the Sinkhorn algorithm. This perspective allows us\nto derive novel variants of Sinkhorn schemes that are robust to noise and bias.\nMoreover, our continuous-time dynamics not only generalize but also offer a\nunified perspective on several recently discovered dynamics in machine learning\nand mathematics, such as the \"Wasserstein mirror flow\" of (Deb et al. 2023) or\nthe \"mean-field Schr\\\"odinger equation\" of (Claisse et al. 2023).\n","authors":["Mohammad Reza Karimi","Ya-Ping Hsieh","Andreas Krause"],"pdf_url":"https://arxiv.org/pdf/2311.16706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16700v1","updated":"2023-11-28T11:22:08Z","published":"2023-11-28T11:22:08Z","title":"Rethinking Intermediate Layers design in Knowledge Distillation for\n Kidney and Liver Tumor Segmentation","summary":" Knowledge distillation(KD) has demonstrated remarkable success across various\ndomains, but its application to medical imaging tasks, such as kidney and liver\ntumor segmentation, has encountered challenges. Many existing KD methods are\nnot specifically tailored for these tasks. Moreover, prevalent KD methods often\nlack a careful consideration of what and from where to distill knowledge from\nthe teacher to the student. This oversight may lead to issues like the\naccumulation of training bias within shallower student layers, potentially\ncompromising the effectiveness of KD. To address these challenges, we propose\nHierarchical Layer-selective Feedback Distillation (HLFD). HLFD strategically\ndistills knowledge from a combination of middle layers to earlier layers and\ntransfers final layer knowledge to intermediate layers at both the feature and\npixel levels. This design allows the model to learn higher-quality\nrepresentations from earlier layers, resulting in a robust and compact student\nmodel. Extensive quantitative evaluations reveal that HLFD outperforms existing\nmethods by a significant margin. For example, in the kidney segmentation task,\nHLFD surpasses the student model (without KD) by over 10pp, significantly\nimproving its focus on tumor-specific features. 
From a qualitative standpoint,\nthe student model trained using HLFD excels at suppressing irrelevant\ninformation and can focus sharply on tumor-specific details, which opens a new\npathway for more efficient and accurate diagnostic tools.\n","authors":["Vandan Gorade","Sparsh Mittal","Debesh Jha","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2311.16700v1.pdf","comment":"Under-review at ISBI-2024"},{"id":"http://arxiv.org/abs/2306.07745v2","updated":"2023-11-28T11:11:54Z","published":"2023-06-13T13:01:42Z","title":"Kernelized Reinforcement Learning with Order Optimal Regret Bounds","summary":" Reinforcement learning (RL) has shown empirical success in various real world\nsettings with complex models and large state-action spaces. The existing\nanalytical results, however, typically focus on settings with a small number of\nstate-actions or simple models such as linearly modeled state-action value\nfunctions. To derive RL policies that efficiently handle large state-action\nspaces with more general value functions, some recent works have considered\nnonlinear function approximation using kernel ridge regression. We propose\n$\\pi$-KRVI, an optimistic modification of least-squares value iteration, when\nthe state-action value function is represented by a reproducing kernel Hilbert\nspace (RKHS). We prove the first order-optimal regret guarantees under a\ngeneral setting. Our results show a significant polynomial in the number of\nepisodes improvement over the state of the art. In particular, with highly\nnon-smooth kernels (such as Neural Tangent kernel or some Mat\\'ern kernels) the\nexisting results lead to trivial (superlinear in the number of episodes) regret\nbounds. We show a sublinear regret bound that is order optimal in the case of\nMat\\'ern kernels where a lower bound on regret is known.\n","authors":["Sattar Vakili","Julia Olkhovskaya"],"pdf_url":"https://arxiv.org/pdf/2306.07745v2.pdf","comment":"Advances in Neural Information Processing Systems (NeurIPS)"},{"id":"http://arxiv.org/abs/2307.12689v2","updated":"2023-11-28T10:59:01Z","published":"2023-07-24T11:04:22Z","title":"Addressing the Impact of Localized Training Data in Graph Neural\n Networks","summary":" Graph Neural Networks (GNNs) have achieved notable success in learning from\ngraph-structured data, owing to their ability to capture intricate dependencies\nand relationships between nodes. They excel in various applications, including\nsemi-supervised node classification, link prediction, and graph generation.\nHowever, it is important to acknowledge that the majority of state-of-the-art\nGNN models are built upon the assumption of an in-distribution setting, which\nhinders their performance on real-world graphs with dynamic structures. In this\narticle, we aim to assess the impact of training GNNs on localized subsets of\nthe graph. Such restricted training data may lead to a model that performs well\nin the specific region it was trained on but fails to generalize and make\naccurate predictions for the entire graph. In the context of graph-based\nsemi-supervised learning (SSL), resource constraints often lead to scenarios\nwhere the dataset is large, but only a portion of it can be labeled, affecting\nthe model's performance. This limitation affects tasks like anomaly detection\nor spam detection when labeling processes are biased or influenced by human\nsubjectivity. 
To tackle the challenges posed by localized training data, we\napproach the problem as an out-of-distribution (OOD) data issue by aligning\nthe distributions between the training data, which represents a small portion\nof labeled data, and the graph inference process that involves making\npredictions for the entire graph. We propose a regularization method to\nminimize distributional discrepancies between localized training data and graph\ninference, improving model performance on OOD data. Extensive tests on popular\nGNN models show significant performance improvement on three citation GNN\nbenchmark datasets. The regularization approach effectively enhances model\nadaptation and generalization, overcoming challenges posed by OOD data.\n","authors":["Akansha A"],"pdf_url":"https://arxiv.org/pdf/2307.12689v2.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2311.16683v1","updated":"2023-11-28T10:55:00Z","published":"2023-11-28T10:55:00Z","title":"Hyper-Relational Knowledge Graph Neural Network for Next POI","summary":" With the advancement of mobile technology, Point of Interest (POI)\nrecommendation systems in Location-based Social Networks (LBSN) have brought\nnumerous benefits to both users and companies. Many existing works employ\nKnowledge Graph (KG) to alleviate the data sparsity issue in LBSN. These\napproaches primarily focus on modeling the pair-wise relations in LBSN to\nenrich the semantics and thereby relieve the data sparsity issue. However,\nexisting approaches seldom consider the hyper-relations in LBSN, such as the\nmobility relation (a 3-ary relation: user-POI-time). This makes it hard for the model\nto exploit the semantics accurately. In addition, prior works overlook the rich\nstructural information inherent in KG, which consists of higher-order relations\nand can further alleviate the impact of data sparsity. To this end, we propose a\nHyper-Relational Knowledge Graph Neural Network (HKGNN) model. In HKGNN, a\nHyper-Relational Knowledge Graph (HKG) that models the LBSN data is constructed\nto maintain and exploit the rich semantics of hyper-relations. Then we propose\na Hypergraph Neural Network to utilize the structural information of HKG in a\ncohesive way. In addition, a self-attention network is used to leverage\nsequential information and make personalized recommendations. Furthermore, side\ninformation, essential in reducing data sparsity by providing background\nknowledge of POIs, is not fully utilized in current methods. In light of this,\nwe extend the current dataset with available side information to further\nlessen the impact of data sparsity. Results of experiments on four real-world\nLBSN datasets demonstrate the effectiveness of our approach compared to\nexisting state-of-the-art methods.\n","authors":["Jixiao Zhang","Yongkang Li","Ruotong Zou","Jingyuan Zhang","Zipei Fan","Xuan Song"],"pdf_url":"https://arxiv.org/pdf/2311.16683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.11526v2","updated":"2023-11-28T10:47:37Z","published":"2023-09-20T06:55:39Z","title":"Likelihood-based Sensor Calibration using Affine Transformation","summary":" An important task in the field of sensor technology is the efficient\nimplementation of adaptation procedures of measurements from one sensor to\nanother sensor of identical design. One idea is to use the estimation of an\naffine transformation between different systems, which can be improved by the\nknowledge of experts. 
This paper presents an improved solution from Glacier\nResearch that was published back in 1973. The results demonstrate the\nadaptability of this solution for various applications, including software\ncalibration of sensors, implementation of expert-based adaptation, and paving\nthe way for future advancements such as distributed learning methods. One idea\nhere is to use the knowledge of experts for estimating an affine transformation\nbetween different systems. We evaluate our research with simulations and also\nwith real measured data of a multi-sensor board with 8 identical sensors. Both the\ndata set and the evaluation script are provided for download. The results show an\nimprovement for both the simulation and the experiments with real data.\n","authors":["Rüdiger Machhamer","Lejla Begic Fazlic","Eray Guven","David Junk","Gunes Karabulut Kurt","Stefan Naumann","Stephan Didas","Klaus-Uwe Gollmer","Ralph Bergmann","Ingo J. Timm","Guido Dartmann"],"pdf_url":"https://arxiv.org/pdf/2309.11526v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10099v2","updated":"2023-11-28T10:35:06Z","published":"2023-08-19T20:10:54Z","title":"Geometric instability of graph neural networks on large graphs","summary":" We analyse the geometric instability of embeddings produced by graph neural\nnetworks (GNNs). Existing methods are only applicable to small graphs and lack\ncontext in the graph domain. We propose a simple, efficient and graph-native\nGraph Gram Index (GGI) to measure such instability, which is invariant to\npermutation, orthogonal transformation, translation and order of evaluation.\nThis allows us to study the varying instability behaviour of GNN embeddings on\nlarge graphs for both node classification and link prediction.\n","authors":["Emily Morris","Haotian Shen","Weiling Du","Muhammad Hamza Sajjad","Borun Shi"],"pdf_url":"https://arxiv.org/pdf/2308.10099v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16670v1","updated":"2023-11-28T10:34:48Z","published":"2023-11-28T10:34:48Z","title":"PyTorch Geometric High Order: A Unified Library for High Order Graph\n Neural Network","summary":" We introduce PyTorch Geometric High Order (PyGHO), a library for High Order\nGraph Neural Networks (HOGNNs) that extends PyTorch Geometric (PyG). Unlike\nordinary Message Passing Neural Networks (MPNNs) that exchange messages between\nnodes, HOGNNs, encompassing subgraph GNNs and k-WL GNNs, encode node tuples, a\nmethod previously lacking a standardized framework and often requiring complex\ncoding. PyGHO's main objective is to provide a unified and user-friendly\ninterface for various HOGNNs. It accomplishes this through streamlined data\nstructures for node tuples, comprehensive data processing utilities, and a\nflexible suite of operators for high-order GNN methodologies. In this work, we\npresent a detailed, in-depth overview of PyGHO and compare HOGNNs implemented with PyGHO\nwith their official implementations on real-world tasks. PyGHO achieves up to\n$50\\%$ acceleration and reduces the code needed for implementation by an order\nof magnitude. 
Our library is available at\n\\url{https://github.com/GraphPKU/PygHO}.\n","authors":["Xiyuan Wang","Muhan Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.16670v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16666v1","updated":"2023-11-28T10:28:35Z","published":"2023-11-28T10:28:35Z","title":"MultiModal-Learning for Predicting Molecular Properties: A Framework\n Based on Image and Graph Structures","summary":" The quest for accurate prediction of drug molecule properties poses a\nfundamental challenge in the realm of Artificial Intelligence Drug Discovery\n(AIDD). An effective representation of drug molecules emerges as a pivotal\ncomponent in this pursuit. Contemporary leading-edge research predominantly\nresorts to self-supervised learning (SSL) techniques to extract meaningful\nstructural representations from large-scale, unlabeled molecular data,\nsubsequently fine-tuning these representations for an array of downstream\ntasks. However, an inherent shortcoming of these studies lies in their singular\nreliance on one modality of molecular information, such as molecule image or\nSMILES representations, thus neglecting the potential complementarity of\nvarious molecular modalities. In response to this limitation, we propose MolIG,\na novel MultiModaL molecular pre-training framework for predicting molecular\nproperties based on Image and Graph structures. MolIG model innovatively\nleverages the coherence and correlation between molecule graph and molecule\nimage to execute self-supervised tasks, effectively amalgamating the strengths\nof both molecular representation forms. This holistic approach allows for the\ncapture of pivotal molecular structural characteristics and high-level semantic\ninformation. Upon completion of pre-training, Graph Neural Network (GNN)\nEncoder is used for the prediction of downstream tasks. In comparison to\nadvanced baseline models, MolIG exhibits enhanced performance in downstream\ntasks pertaining to molecular property prediction within benchmark groups such\nas MoleculeNet Benchmark Group and ADMET Benchmark Group.\n","authors":["Zhuoyuan Wang","Jiacong Mi","Shan Lu","Jieyue He"],"pdf_url":"https://arxiv.org/pdf/2311.16666v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2212.04548v2","updated":"2023-11-28T10:20:09Z","published":"2022-12-08T20:24:59Z","title":"STLGRU: Spatio-Temporal Lightweight Graph GRU for Traffic Flow\n Prediction","summary":" Reliable forecasting of traffic flow requires efficient modeling of traffic\ndata. Different correlations and influences arise in a dynamic traffic network,\nmaking modeling a complicated task. Existing literature has proposed many\ndifferent methods to capture the complex underlying spatial-temporal relations\nof traffic networks. However, methods still struggle to capture different local\nand global dependencies of long-range nature. Also, as more and more\nsophisticated methods are being proposed, models are increasingly becoming\nmemory-heavy and, thus, unsuitable for low-powered devices. In this paper, we\nfocus on solving these problems by proposing a novel deep learning framework -\nSTLGRU. Specifically, our proposed STLGRU can effectively capture both local\nand global spatial-temporal relations of a traffic network using\nmemory-augmented attention and gating mechanism. 
Instead of employing separate\ntemporal and spatial components, we show that our memory module and gated unit\ncan learn the spatial-temporal dependencies successfully, allowing for reduced\nmemory usage with fewer parameters. We extensively experiment on several\nreal-world traffic prediction datasets to show that our model performs better\nthan existing methods while the memory footprint remains lower. Code is\navailable at \\url{https://github.com/Kishor-Bhaumik/STLGRU}.\n","authors":["Kishor Kumar Bhaumik","Fahim Faisal Niloy","Saif Mahmud","Simon Woo"],"pdf_url":"https://arxiv.org/pdf/2212.04548v2.pdf","comment":"We withdraw for now and shall further work on the manuscript and\n upload it again"},{"id":"http://arxiv.org/abs/2311.16656v1","updated":"2023-11-28T10:17:52Z","published":"2023-11-28T10:17:52Z","title":"Pseudo-Likelihood Inference","summary":" Simulation-Based Inference (SBI) is a common name for an emerging family of\napproaches that infer the model parameters when the likelihood is intractable.\nExisting SBI methods either approximate the likelihood, such as Approximate\nBayesian Computation (ABC) or directly model the posterior, such as Sequential\nNeural Posterior Estimation (SNPE). While ABC is efficient on low-dimensional\nproblems, on higher-dimensional tasks, it is generally outperformed by SNPE,\nwhich leverages function approximation. In this paper, we propose\nPseudo-Likelihood Inference (PLI), a new method that brings neural\napproximation into ABC, making it competitive on challenging Bayesian system\nidentification tasks. By utilizing integral probability metrics, we introduce a\nsmooth likelihood kernel with an adaptive bandwidth that is updated based on\ninformation-theoretic trust regions. Thanks to this formulation, our method (i)\nallows for optimizing neural posteriors via gradient descent, (ii) does not\nrely on summary statistics, and (iii) enables multiple observations as input.\nIn comparison to SNPE, it leads to improved performance when more data is\navailable. The effectiveness of PLI is evaluated on four classical SBI\nbenchmark tasks and on a highly dynamic physical system, showing particular\nadvantages on stochastic simulations and multi-modal posterior landscapes.\n","authors":["Theo Gruner","Boris Belousov","Fabio Muratore","Daniel Palenicek","Jan Peters"],"pdf_url":"https://arxiv.org/pdf/2311.16656v1.pdf","comment":"27 pages, 12 figures, Published as a conference paper at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.16654v1","updated":"2023-11-28T10:13:31Z","published":"2023-11-28T10:13:31Z","title":"Elucidating Discrepancy in Explanations of Predictive Models Developed\n using EMR","summary":" The lack of transparency and explainability hinders the clinical adoption of\nMachine learning (ML) algorithms. While explainable artificial intelligence\n(XAI) methods have been proposed, little research has focused on the agreement\nbetween these methods and expert clinical knowledge. This study applies current\nstate-of-the-art explainability methods to clinical decision support algorithms\ndeveloped for Electronic Medical Records (EMR) data to analyse the concordance\nbetween these factors and discusses causes for identified discrepancies from a\nclinical and technical perspective. 
Important factors for achieving trustworthy\nXAI solutions for clinical decision support are also discussed.\n","authors":["Aida Brankovic","Wenjie Huang","David Cook","Sankalp Khanna","Konstanty Bialkowski"],"pdf_url":"https://arxiv.org/pdf/2311.16654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14971v2","updated":"2023-11-28T10:08:35Z","published":"2023-11-25T09:08:30Z","title":"Segmentation of diagnostic tissue compartments on whole slide images\n with renal thrombotic microangiopathies (TMAs)","summary":" The thrombotic microangiopathies (TMAs) manifest in renal biopsy histology\nwith a broad spectrum of acute and chronic findings. Precise diagnostic\ncriteria for a renal biopsy diagnosis of TMA are missing. As a first step\ntowards a machine learning- and computer vision-based analysis of whole slide\nimages from renal biopsies, we trained a segmentation model for the decisive\ndiagnostic kidney tissue compartments (artery, arteriole, glomerulus) on a set of\nwhole slide images from renal biopsies with TMAs and Mimickers (distinct\ndiseases with a nephropathological appearance similar to TMA, such as severe benign\nnephrosclerosis, various vasculitides, Bevacizumab-plug glomerulopathy,\narteriolar light chain deposition disease). Our segmentation model combines a\nU-Net-based tissue detection with a Shifted windows-transformer architecture to\nreach excellent segmentation results for even the most severely altered\nglomeruli, arterioles and arteries, even on unseen staining domains from a\ndifferent nephropathology lab. With accurate automatic segmentation of the\ndecisive renal biopsy compartments in human renal vasculopathies, we have laid\nthe foundation for large-scale compartment-specific machine learning and\ncomputer vision analysis of renal biopsy repositories with TMAs.\n","authors":["Huy Q. Vo","Pietro A. Cicalese","Surya Seshan","Syed A. Rizvi","Aneesh Vathul","Gloria Bueno","Anibal Pedraza Dorado","Niels Grabe","Katharina Stolle","Francesco Pesce","Joris J. T. H. Roelofs","Jesper Kers","Vitoantonio Bevilacqua","Nicola Altini","Bernd Schröppel","Dario Roccatello","Antonella Barreca","Savino Sciascia","Chandra Mohan","Hien V. Nguyen","Jan U. Becker"],"pdf_url":"https://arxiv.org/pdf/2311.14971v2.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2311.16646v1","updated":"2023-11-28T09:53:05Z","published":"2023-11-28T09:53:05Z","title":"Rethinking Backdoor Attacks on Dataset Distillation: A Kernel Method\n Perspective","summary":" Dataset distillation offers a potential means to enhance data efficiency in\ndeep learning. Recent studies have shown its ability to counteract backdoor\nrisks present in original training samples. In this study, we delve into the\ntheoretical aspects of backdoor attacks and dataset distillation based on\nkernel methods. We introduce two new theory-driven trigger pattern generation\nmethods specialized for dataset distillation. Following a comprehensive set of\nanalyses and experiments, we show that our optimization-based trigger design\nframework informs effective backdoor attacks on dataset distillation. Notably,\ndatasets poisoned by our designed trigger prove resilient against conventional\nbackdoor attack detection and mitigation methods. 
Our empirical results\nvalidate that the triggers developed using our approaches are proficient at\nexecuting resilient backdoor attacks.\n","authors":["Ming-Yu Chung","Sheng-Yen Chou","Chia-Mu Yu","Pin-Yu Chen","Sy-Yen Kuo","Tsung-Yi Ho"],"pdf_url":"https://arxiv.org/pdf/2311.16646v1.pdf","comment":"19 pages, 4 figures"},{"id":"http://arxiv.org/abs/2311.16632v1","updated":"2023-11-28T09:34:44Z","published":"2023-11-28T09:34:44Z","title":"Opening the Black Box: Towards inherently interpretable energy data\n imputation models using building physics insight","summary":" Missing data are frequently observed by practitioners and researchers in the\nbuilding energy modeling community. In this regard, advanced data-driven\nsolutions, such as Deep Learning methods, are typically required to reflect the\nnon-linear behavior of these anomalies. As an ongoing research question related\nto Deep Learning, a model's applicability to limited data settings can be\nexplored by introducing prior knowledge in the network. This same strategy can\nalso lead to more interpretable predictions, hence facilitating the field\napplication of the approach. For that purpose, the aim of this paper is to\npropose the use of Physics-informed Denoising Autoencoders (PI-DAE) for missing\ndata imputation in commercial buildings. In particular, the presented method\nenforces physics-inspired soft constraints to the loss function of a Denoising\nAutoencoder (DAE). In order to quantify the benefits of the physical component,\nan ablation study between different DAE configurations is conducted. First,\nthree univariate DAEs are optimized separately on indoor air temperature,\nheating, and cooling data. Then, two multivariate DAEs are derived from the\nprevious configurations. Eventually, a building thermal balance equation is\ncoupled to the last multivariate configuration to obtain PI-DAE. Additionally,\ntwo commonly used benchmarks are employed to support the findings. It is shown\nhow introducing physical knowledge in a multivariate Denoising Autoencoder can\nenhance the inherent model interpretability through the optimized physics-based\ncoefficients. While no significant improvement is observed in terms of\nreconstruction error with the proposed PI-DAE, its enhanced robustness to\nvarying rates of missing data and the valuable insights derived from the\nphysics-based coefficients create opportunities for wider applications within\nbuilding systems and the built environment.\n","authors":["Antonio Liguori","Matias Quintana","Chun Fu","Clayton Miller","Jérôme Frisch","Christoph van Treeck"],"pdf_url":"https://arxiv.org/pdf/2311.16632v1.pdf","comment":"Under review in Energy and Buildings"},{"id":"http://arxiv.org/abs/2311.16630v1","updated":"2023-11-28T09:30:52Z","published":"2023-11-28T09:30:52Z","title":"Outfit Completion via Conditional Set Transformation","summary":" In this paper, we formulate the outfit completion problem as a set retrieval\ntask and propose a novel framework for solving this problem. The proposal\nincludes a conditional set transformation architecture with deep neural\nnetworks and a compatibility-based regularization method. The proposed method\nutilizes a map that is permutation-invariant with respect to the input set and\npermutation-equivariant with respect to the condition set. This allows retrieving a set\nthat is compatible with the input set while reflecting the properties of the\ncondition set. 
In addition, since this structure outputs the elements of the\noutput set in a single inference, it can achieve a scalable inference speed\nwith respect to the cardinality of the output set. Experimental results on real\ndata reveal that the proposed method outperforms existing approaches in terms\nof accuracy of the outfit completion task, condition satisfaction, and\ncompatibility of completion results.\n","authors":["Takuma Nakamura","Yuki Saito","Ryosuke Goto"],"pdf_url":"https://arxiv.org/pdf/2311.16630v1.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2311.16628v1","updated":"2023-11-28T09:27:44Z","published":"2023-11-28T09:27:44Z","title":"Symmetry-regularized neural ordinary differential equations","summary":" Neural Ordinary Differential Equations (Neural ODEs) are a class of deep\nneural network models that interpret the hidden state dynamics of neural\nnetworks as an ordinary differential equation, thereby capable of capturing\nsystem dynamics in a continuous time framework. In this work, I integrate\nsymmetry regularization into Neural ODEs. In particular, I use continuous Lie\nsymmetry of ODEs and PDEs associated with the model to derive conservation laws\nand add them to the loss function, making it physics-informed. This\nincorporation of inherent structural properties into the loss function could\nsignificantly improve robustness and stability of the model during training. To\nillustrate this method, I employ a toy model that utilizes a cosine rate of\nchange in the hidden state, showcasing the process of identifying Lie\nsymmetries, deriving conservation laws, and constructing a new loss function.\n","authors":["Wenbo Hao"],"pdf_url":"https://arxiv.org/pdf/2311.16628v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16625v1","updated":"2023-11-28T09:25:23Z","published":"2023-11-28T09:25:23Z","title":"Gaussian Processes for Monitoring Air-Quality in Kampala","summary":" Monitoring air pollution is of vital importance to the overall health of the\npopulation. Unfortunately, devices that can measure air quality can be\nexpensive, and many cities in low and middle-income countries have to rely on a\nsparse allocation of them. In this paper, we investigate the use of Gaussian\nProcesses for both nowcasting the current air-pollution in places where there\nare no sensors and forecasting the air-pollution in the future at the sensor\nlocations. In particular, we focus on the city of Kampala in Uganda, using data\nfrom AirQo's network of sensors. We demonstrate the advantage of removing\noutliers and compare different kernel functions and additional inputs. We also\ncompare two sparse approximations to allow for the large amounts of temporal\ndata in the dataset.\n","authors":["Clara Stoddart","Lauren Shrack","Richard Sserunjogi","Usman Abdul-Ganiy","Engineer Bainomugisha","Deo Okure","Ruth Misener","Jose Pablo Folch","Ruby Sedgwick"],"pdf_url":"https://arxiv.org/pdf/2311.16625v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.07221v3","updated":"2023-11-28T09:23:51Z","published":"2023-02-14T17:51:00Z","title":"On the Role of Randomization in Adversarially Robust Classification","summary":" Deep neural networks are known to be vulnerable to small adversarial\nperturbations in test data. To defend against adversarial attacks,\nprobabilistic classifiers have been proposed as an alternative to deterministic\nones. However, the literature has conflicting findings on the effectiveness of\nprobabilistic classifiers in comparison to deterministic ones. 
In this paper,\nwe clarify the role of randomization in building adversarially robust\nclassifiers. Given a base hypothesis set of deterministic classifiers, we show\nthe conditions under which a randomized ensemble outperforms the hypothesis set\nin adversarial risk, extending previous results. Additionally, we show that for\nany probabilistic binary classifier (including randomized ensembles), there\nexists a deterministic classifier that outperforms it. Finally, we give an\nexplicit description of the deterministic hypothesis set that contains such a\ndeterministic classifier for many types of commonly used probabilistic\nclassifiers, i.e. randomized ensembles and parametric/input noise injection.\n","authors":["Lucas Gnecco-Heredia","Yann Chevaleyre","Benjamin Negrevergne","Laurent Meunier","Muni Sreenivas Pydi"],"pdf_url":"https://arxiv.org/pdf/2302.07221v3.pdf","comment":"10 pages main paper (27 total), 2 figures in main paper. Neurips 2023"},{"id":"http://arxiv.org/abs/2311.16621v1","updated":"2023-11-28T09:22:17Z","published":"2023-11-28T09:22:17Z","title":"Beyond Labels: Advancing Cluster Analysis with the Entropy of Distance\n Distribution (EDD)","summary":" In the evolving landscape of data science, the accurate quantification of\nclustering in high-dimensional data sets remains a significant challenge,\nespecially in the absence of predefined labels. This paper introduces a novel\napproach, the Entropy of Distance Distribution (EDD), which represents a\nparadigm shift in label-free clustering analysis. Traditional methods, reliant\non discrete labels, often struggle to discern intricate cluster patterns in\nunlabeled data. EDD, however, leverages the characteristic differences in\npairwise point-to-point distances to discern clustering tendencies, independent\nof data labeling.\n Our method employs the Shannon information entropy to quantify the\n'peakedness' or 'flatness' of distance distributions in a data set. This\nentropy measure, normalized against its maximum value, effectively\ndistinguishes between strongly clustered data (indicated by pronounced peaks in\ndistance distribution) and more homogeneous, non-clustered data sets. This\nlabel-free quantification is resilient against global translations and\npermutations of data points, and with an additional dimension-wise z-scoring,\nit becomes invariant to data set scaling.\n We demonstrate the efficacy of EDD through a series of experiments involving\ntwo-dimensional data spaces with Gaussian cluster centers. Our findings reveal\na monotonic increase in the EDD value with the widening of cluster widths,\nmoving from well-separated to overlapping clusters. This behavior underscores\nthe method's sensitivity and accuracy in detecting varying degrees of\nclustering. EDD's potential extends beyond conventional clustering analysis,\noffering a robust, scalable tool for unraveling complex data structures without\nreliance on pre-assigned labels.\n","authors":["Claus Metzner","Achim Schilling","Patrick Krauss"],"pdf_url":"https://arxiv.org/pdf/2311.16621v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16620v1","updated":"2023-11-28T09:21:48Z","published":"2023-11-28T09:21:48Z","title":"On the Long Range Abilities of Transformers","summary":" Despite their dominance in modern DL and, especially, NLP domains,\ntransformer architectures exhibit sub-optimal performance on long-range tasks\ncompared to recent layers that are specifically designed for this purpose. 
In\nthis work, drawing inspiration from key attributes of long-range layers, such\nas state-space layers, linear RNN layers, and global convolution layers, we\ndemonstrate that minimal modifications to the transformer architecture can\nsignificantly enhance performance on the Long Range Arena (LRA) benchmark, thus\nnarrowing the gap with these specialized layers. We identify that two key\nprinciples for long-range tasks are (i) incorporating an inductive bias towards\nsmoothness, and (ii) locality. As we show, integrating these ideas into the\nattention mechanism improves results with a negligible amount of additional\ncomputation and without any additional trainable parameters. Our theory and\nexperiments also shed light on the reasons for the inferior performance of\ntransformers on long-range tasks and identify critical properties that are\nessential for successfully capturing long-range dependencies.\n","authors":["Itamar Zimerman","Lior Wolf"],"pdf_url":"https://arxiv.org/pdf/2311.16620v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2311.16616v1","updated":"2023-11-28T09:12:37Z","published":"2023-11-28T09:12:37Z","title":"Adversarial Distribution Balancing for Counterfactual Reasoning","summary":" The development of causal prediction models is challenged by the fact that\nthe outcome is only observable for the applied (factual) intervention and not\nfor its alternatives (the so-called counterfactuals); in medicine we only know\npatients' survival for the administered drug and not for other therapeutic\noptions. Machine learning approaches for counterfactual reasoning have to deal\nwith both unobserved outcomes and distributional differences due to non-random\ntreatment administration. Unsupervised domain adaptation (UDA) addresses\nsimilar issues; one has to deal with unobserved outcomes -- the labels of the\ntarget domain -- and distributional differences between source and target\ndomain. We propose Adversarial Distribution Balancing for Counterfactual\nReasoning (ADBCR), which directly uses potential outcome estimates of the\ncounterfactuals to remove spurious causal relations. We show that ADBCR\noutcompetes state-of-the-art methods on three benchmark datasets, and\ndemonstrate that ADBCR's performance can be further improved if unlabeled\nvalidation data are included in the training procedure to better adapt the\nmodel to the validation domain.\n","authors":["Stefan Schrod","Fabian Sinz","Michael Altenbuchinger"],"pdf_url":"https://arxiv.org/pdf/2311.16616v1.pdf","comment":"Implementation available at https://github.com/sschrod/ADBCR"},{"id":"http://arxiv.org/abs/2311.16614v1","updated":"2023-11-28T09:11:02Z","published":"2023-11-28T09:11:02Z","title":"A Multivariate Unimodality Test Harnenssing the Dip Statistic of\n Mahalanobis Distances Over Random Projections","summary":" Unimodality, pivotal in statistical analysis, offers insights into dataset\nstructures and drives sophisticated analytical procedures. While unimodality's\nconfirmation is straightforward for one-dimensional data using methods like\nSilverman's approach and Hartigans' dip statistic, its generalization to higher\ndimensions remains challenging. By extrapolating one-dimensional unimodality\nprinciples to multi-dimensional spaces through linear random projections and\nleveraging point-to-point distancing, our method, rooted in\n$\\alpha$-unimodality assumptions, presents a novel multivariate unimodality\ntest named mud-pod. 
Both theoretical and empirical studies confirm the efficacy\nof our method in unimodality assessment of multidimensional datasets as well as\nin estimating the number of clusters.\n","authors":["Prodromos Kolyvakis","Aristidis Likas"],"pdf_url":"https://arxiv.org/pdf/2311.16614v1.pdf","comment":"12 pages, 1 figure"},{"id":"http://arxiv.org/abs/2311.16609v1","updated":"2023-11-28T08:54:29Z","published":"2023-11-28T08:54:29Z","title":"Eigenmatrix for unstructured sparse recovery","summary":" This paper considers the unstructured sparse recovery problems in a general\nform. Examples include rational approximation, spectral function estimation,\nFourier inversion, Laplace inversion, and sparse deconvolution. The main\nchallenges are the noise in the sample values and the unstructured nature of\nthe sample locations. This paper proposes the eigenmatrix, a data-driven\nconstruction with desired approximate eigenvalues and eigenvectors. The\neigenmatrix offers a new way for these sparse recovery problems. Numerical\nresults are provided to demonstrate the efficiency of the proposed method.\n","authors":["Lexing Ying"],"pdf_url":"https://arxiv.org/pdf/2311.16609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02691v2","updated":"2023-11-28T08:54:13Z","published":"2023-10-04T10:02:49Z","title":"Robust Ocean Subgrid-Scale Parameterizations Using Fourier Neural\n Operators","summary":" In climate simulations, small-scale processes shape ocean dynamics but remain\ncomputationally expensive to resolve directly. For this reason, their\ncontributions are commonly approximated using empirical parameterizations,\nwhich lead to significant errors in long-term projections. In this work, we\ndevelop parameterizations based on Fourier Neural Operators, showcasing their\naccuracy and generalizability in comparison to other approaches. Finally, we\ndiscuss the potential and limitations of neural networks operating in the\nfrequency domain, paving the way for future investigation.\n","authors":["Victor Mangeleer","Gilles Louppe"],"pdf_url":"https://arxiv.org/pdf/2310.02691v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16605v1","updated":"2023-11-28T08:45:37Z","published":"2023-11-28T08:45:37Z","title":"LasTGL: An Industrial Framework for Large-Scale Temporal Graph Learning","summary":" Over the past few years, graph neural networks (GNNs) have become powerful\nand practical tools for learning on (static) graph-structure data. However,\nmany real-world applications, such as social networks and e-commerce, involve\ntemporal graphs where nodes and edges are dynamically evolving. Temporal graph\nneural networks (TGNNs) have progressively emerged as an extension of GNNs to\naddress time-evolving graphs and have gradually become a trending research\ntopic in both academics and industry. Advancing research in such an emerging\nfield requires new tools to compose TGNN models and unify their different\nschemes in dealing with temporal graphs. To facilitate research and application\nin temporal graph learning, we introduce LasTGL, an industrial framework that\nintegrates unified and extensible implementations of common temporal graph\nlearning algorithms for various advanced tasks. The purpose of LasTGL is to\nprovide the essential building blocks for solving temporal graph learning\ntasks, focusing on the guiding principles of user-friendliness and quick\nprototyping on which PyTorch is based. 
In particular, LasTGL provides\ncomprehensive temporal graph datasets, TGNN models and utilities along with\nwell-documented tutorials, making it suitable for both absolute beginners and\nexpert deep learning practitioners alike.\n","authors":["Jintang Li","Jiawang Dan","Ruofan Wu","Jing Zhou","Sheng Tian","Yunfei Liu","Baokun Wang","Changhua Meng","Weiqiang Wang","Yuchang Zhu","Liang Chen","Zibin Zheng"],"pdf_url":"https://arxiv.org/pdf/2311.16605v1.pdf","comment":"Preprint; Work in progress"},{"id":"http://arxiv.org/abs/2310.08165v2","updated":"2023-11-28T08:45:13Z","published":"2023-10-12T09:37:56Z","title":"COVID-19 detection using ViT transformer-based approach from Computed\n Tomography Images","summary":" In here, we introduce a novel approach to enhance the accuracy and efficiency\nof COVID-19 diagnosis using CT images. Leveraging state-of-the-art Transformer\nmodels in computer vision, we employed the base ViT Transformer configured for\n224x224-sized input images, modifying the output to suit the binary\nclassification task. Notably, input images were resized from the standard CT\nscan size of 512x512 to match the model's expectations. Our method implements a\nsystematic patient-level prediction strategy, classifying individual CT slices\nas COVID-19 or non-COVID. To determine the overall diagnosis for each patient,\na majority voting approach as well as other thresholding approaches were\nemployed. This method involves evaluating all CT slices for a given patient and\nassigning the patient the diagnosis that relates to the thresholding for the CT\nscan. This meticulous patient-level prediction process contributes to the\nrobustness of our solution as it starts from 2D-slices to 3D-patient level.\nThroughout the evaluation process, our approach resulted in 0.7 macro F1 score\non the COV19-CT -DB validation set. To ensure the reliability and effectiveness\nof our model, we rigorously validate it on the extensive COV-19 CT dataset,\nwhich is meticulously annotated for the task. This dataset, with its\ncomprehensive annotations, reinforces the overall robustness of our solution.\n","authors":["Kenan Morani"],"pdf_url":"https://arxiv.org/pdf/2310.08165v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16604v1","updated":"2023-11-28T08:44:04Z","published":"2023-11-28T08:44:04Z","title":"LC4SV: A Denoising Framework Learning to Compensate for Unseen Speaker\n Verification Models","summary":" The performance of speaker verification (SV) models may drop dramatically in\nnoisy environments. A speech enhancement (SE) module can be used as a front-end\nstrategy. However, existing SE methods may fail to bring performance\nimprovements to downstream SV systems due to artifacts in the predicted signals\nof SE models. To compensate for artifacts, we propose a generic denoising\nframework named LC4SV, which can serve as a pre-processor for various unknown\ndownstream SV models. In LC4SV, we employ a learning-based interpolation agent\nto automatically generate the appropriate coefficients between the enhanced\nsignal and its noisy input to improve SV performance in noisy environments. Our\nexperimental results demonstrate that LC4SV consistently improves the\nperformance of various unseen SV systems. 
To the best of our knowledge, this\nwork is the first attempt to develop a learning-based interpolation scheme\naiming at improving SV performance in noisy environments.\n","authors":["Chi-Chang Lee","Hong-Wei Chen","Chu-Song Chen","Hsin-Min Wang","Tsung-Te Liu","Yu Tsao"],"pdf_url":"https://arxiv.org/pdf/2311.16604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16602v1","updated":"2023-11-28T08:43:10Z","published":"2023-11-28T08:43:10Z","title":"GSP-KalmanNet: Tracking Graph Signals via Neural-Aided Kalman Filtering","summary":" Dynamic systems of graph signals are encountered in various applications,\nincluding social networks, power grids, and transportation. While such systems\ncan often be described as state space (SS) models, tracking graph signals via\nconventional tools based on the Kalman filter (KF) and its variants is\ntypically challenging. This is due to the nonlinearity, high dimensionality,\nirregularity of the domain, and complex modeling associated with real-world\ndynamic systems of graph signals. In this work, we study the tracking of graph\nsignals using a hybrid model-based/data-driven approach. We develop the\nGSP-KalmanNet, which tracks the hidden graphical states from the graphical\nmeasurements by jointly leveraging graph signal processing (GSP) tools and deep\nlearning (DL) techniques. The derivations of the GSP-KalmanNet are based on\nextending the KF to exploit the inherent graph structure via graph frequency\ndomain filtering, which considerably simplifies the computational complexity\nentailed in processing high-dimensional signals and increases the robustness to\nsmall topology changes. Then, we use data to learn the Kalman gain following\nthe recently proposed KalmanNet framework, which copes with partial and\napproximated modeling, without forcing a specific model over the noise\nstatistics. Our empirical results demonstrate that the proposed GSP-KalmanNet\nachieves enhanced accuracy and run time performance as well as improved\nrobustness to model misspecifications compared with both model-based and\ndata-driven benchmarks.\n","authors":["Itay Buchnik","Guy Sagi","Nimrod Leinwand","Yuval Loya","Nir Shlezinger","Tirza Routtenberg"],"pdf_url":"https://arxiv.org/pdf/2311.16602v1.pdf","comment":"Submitted for possible publication in the IEEE"},{"id":"http://arxiv.org/abs/2303.01928v3","updated":"2023-11-28T08:41:59Z","published":"2023-03-03T13:53:36Z","title":"FairShap: A Data Re-weighting Approach for Algorithmic Fairness based on\n Shapley Values","summary":" Algorithmic fairness is of utmost societal importance, yet the current trend\nin large-scale machine learning models requires training with massive datasets\nthat are frequently biased. In this context, pre-processing methods that focus\non modeling and correcting bias in the data emerge as valuable approaches. In\nthis paper, we propose FairShap, a novel instance-level data re-weighting\nmethod for fair algorithmic decision-making through data valuation by means of\nShapley Values. FairShap is model-agnostic and easily interpretable, as it\nmeasures the contribution of each training data point to a predefined fairness\nmetric. We empirically validate FairShap on several state-of-the-art datasets\nof different nature, with a variety of training scenarios and models and show\nhow it yields fairer models with similar levels of accuracy than the baselines.\nWe illustrate FairShap's interpretability by means of histograms and latent\nspace visualizations. 
Moreover, we perform a utility-fairness study, and\nablation and runtime experiments to illustrate the impact of the size of the\nreference dataset and FairShap's computational cost depending on the size of\nthe dataset and the number of features. We believe that FairShap represents a\npromising direction in interpretable and model-agnostic approaches to\nalgorithmic fairness that yield competitive accuracy even when only biased\ndatasets are available.\n","authors":["Adrian Arnaiz-Rodriguez","Nuria Oliver"],"pdf_url":"https://arxiv.org/pdf/2303.01928v3.pdf","comment":"33 pages, 11 figures, 7 tables"},{"id":"http://arxiv.org/abs/2311.16595v1","updated":"2023-11-28T08:27:27Z","published":"2023-11-28T08:27:27Z","title":"D4AM: A General Denoising Framework for Downstream Acoustic Models","summary":" The performance of acoustic models degrades notably in noisy environments.\nSpeech enhancement (SE) can be used as a front-end strategy to aid automatic\nspeech recognition (ASR) systems. However, existing training objectives of SE\nmethods are not fully effective at integrating speech-text and noisy-clean\npaired data for training toward unseen ASR systems. In this study, we propose a\ngeneral denoising framework, D4AM, for various downstream acoustic models. Our\nframework fine-tunes the SE model with the backward gradient according to a\nspecific acoustic model and the corresponding classification objective. In\naddition, our method aims to consider the regression objective as an auxiliary\nloss to make the SE model generalize to other unseen acoustic models. To\njointly train an SE unit with regression and classification objectives, D4AM\nuses an adjustment scheme to directly estimate suitable weighting coefficients\nrather than undergoing a grid search process with additional training costs.\nThe adjustment scheme consists of two parts: gradient calibration and\nregression objective weighting. The experimental results show that D4AM can\nconsistently and effectively provide improvements to various unseen acoustic\nmodels and outperforms other combination setups. Specifically, when evaluated\non the Google ASR API with real noisy data completely unseen during SE\ntraining, D4AM achieves a relative WER reduction of 24.65% compared with the\ndirect feeding of noisy input. To our knowledge, this is the first work that\ndeploys an effective combination scheme of regression (denoising) and\nclassification (ASR) objectives to derive a general pre-processor applicable to\nvarious unseen ASR systems. Our code is available at\nhttps://github.com/ChangLee0903/D4AM.\n","authors":["Chi-Chang Lee","Yu Tsao","Hsin-Min Wang","Chu-Song Chen"],"pdf_url":"https://arxiv.org/pdf/2311.16595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16593v1","updated":"2023-11-28T08:18:30Z","published":"2023-11-28T08:18:30Z","title":"Empowering COVID-19 Detection: Optimizing Performance Through Fine-Tuned\n EfficientNet Deep Learning Architecture","summary":" The worldwide COVID-19 pandemic has profoundly influenced the health and\neveryday experiences of individuals across the planet. It is a highly\ncontagious respiratory disease requiring early and accurate detection to curb\nits rapid transmission. Initial testing methods primarily revolved around\nidentifying the genetic composition of the coronavirus, exhibiting a relatively\nlow detection rate and requiring a time-intensive procedure. 
To address this\nchallenge, experts have suggested using radiological imagery, particularly\nchest X-rays, as a valuable approach within the diagnostic protocol. This study\ninvestigates the potential of leveraging radiographic imaging (X-rays) with\ndeep learning algorithms to swiftly and precisely identify COVID-19 patients.\nThe proposed approach elevates the detection accuracy by fine-tuning with\nappropriate layers on various established transfer learning models. The\nexperimentation was conducted on a COVID-19 X-ray dataset containing 2000\nimages. The accuracy rate achieved was an impressive 100% for the EfficientNetB4\nmodel. The fine-tuned EfficientNetB4 achieved an excellent accuracy score,\nshowcasing its potential as a robust COVID-19 detection model. Furthermore,\nEfficientNetB4 excelled in identifying lung disease using a chest X-ray dataset\ncontaining 4,350 images, achieving remarkable performance with an accuracy of\n99.17%, precision of 99.13%, recall of 99.16%, and f1-score of 99.14%. These\nresults highlight the promise of fine-tuned transfer learning for efficient\nlung disease detection through medical imaging, especially with X-ray images. This\nresearch offers radiologists an effective means of aiding rapid and precise\nCOVID-19 diagnosis and provides valuable assistance for healthcare\nprofessionals in accurately identifying affected patients.\n","authors":["Md. Alamin Talukder","Md. Abu Layek","Mohsin Kazi","Md Ashraf Uddin","Sunil Aryal"],"pdf_url":"https://arxiv.org/pdf/2311.16593v1.pdf","comment":"Computers in Biology and Medicine [Q1, IF: 7.7, CS: 9.2]"},{"id":"http://arxiv.org/abs/2311.16589v1","updated":"2023-11-28T08:15:27Z","published":"2023-11-28T08:15:27Z","title":"Improving Lane Detection Generalization: A Novel Framework using HD Maps\n for Boosting Diversity","summary":" Lane detection is a vital task for vehicles to navigate and localize their\nposition on the road. To ensure reliable results, lane detection algorithms\nmust have robust generalization performance in various road environments.\nHowever, despite the significant performance improvement of deep learning-based\nlane detection algorithms, their generalization performance in response to\nchanges in road environments still falls short of expectations. In this paper,\nwe present a novel framework for single-source domain generalization (SSDG) in\nlane detection. By decomposing data into lane structures and surroundings, we\nenhance diversity using High-Definition (HD) maps and generative models. Rather\nthan expanding data volume, we strategically select a core subset of data,\nmaximizing diversity and optimizing performance. Our extensive experiments\ndemonstrate that our framework enhances the generalization performance of lane\ndetection, comparable to the domain adaptation-based method.\n","authors":["Daeun Lee","Minhyeok Heo","Jiwon Kim"],"pdf_url":"https://arxiv.org/pdf/2311.16589v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.16584v1","updated":"2023-11-28T08:01:43Z","published":"2023-11-28T08:01:43Z","title":"FedAL: Black-Box Federated Knowledge Distillation Enabled by Adversarial\n Learning","summary":" Knowledge distillation (KD) can enable collaborative learning among\ndistributed clients that have different model architectures and do not share\ntheir local data and model parameters with others. Each client updates its\nlocal model using the average model output/feature of all client models as the\ntarget, known as federated KD. 
However, existing federated KD methods often do\nnot perform well when clients' local models are trained with heterogeneous\nlocal datasets. In this paper, we propose Federated knowledge distillation\nenabled by Adversarial Learning (FedAL) to address the data heterogeneity among\nclients. First, to alleviate the local model output divergence across clients\ncaused by data heterogeneity, the server acts as a discriminator to guide\nclients' local model training to achieve consensus model outputs among clients\nthrough a min-max game between clients and the discriminator. Moreover,\ncatastrophic forgetting may happen during the clients' local training and\nglobal knowledge transfer due to clients' heterogeneous local data. Towards\nthis challenge, we design the less-forgetting regularization for both local\ntraining and global knowledge transfer to guarantee clients' ability to\ntransfer/learn knowledge to/from others. Experimental results show that FedAL\nand its variants achieve higher accuracy than other federated KD baselines.\n","authors":["Pengchao Han","Xingyan Shi","Jianwei Huang"],"pdf_url":"https://arxiv.org/pdf/2311.16584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14714v2","updated":"2023-11-28T07:32:55Z","published":"2023-10-23T08:51:05Z","title":"BatteryML:An Open-source platform for Machine Learning on Battery\n Degradation","summary":" Battery degradation remains a pivotal concern in the energy storage domain,\nwith machine learning emerging as a potent tool to drive forward insights and\nsolutions. However, this intersection of electrochemical science and machine\nlearning poses complex challenges. Machine learning experts often grapple with\nthe intricacies of battery science, while battery researchers face hurdles in\nadapting intricate models tailored to specific datasets. Beyond this, a\ncohesive standard for battery degradation modeling, inclusive of data formats\nand evaluative benchmarks, is conspicuously absent. Recognizing these\nimpediments, we present BatteryML - a one-step, all-encompass, and open-source\nplatform designed to unify data preprocessing, feature extraction, and the\nimplementation of both traditional and state-of-the-art models. This\nstreamlined approach promises to enhance the practicality and efficiency of\nresearch applications. BatteryML seeks to fill this void, fostering an\nenvironment where experts from diverse specializations can collaboratively\ncontribute, thus elevating the collective understanding and advancement of\nbattery research.The code for our project is publicly available on GitHub at\nhttps://github.com/microsoft/BatteryML.\n","authors":["Han Zhang","Xiaofan Gui","Shun Zheng","Ziheng Lu","Yuqi Li","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2310.14714v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16556v1","updated":"2023-11-28T06:52:53Z","published":"2023-11-28T06:52:53Z","title":"Scalable Label Distribution Learning for Multi-Label Classification","summary":" Multi-label classification (MLC) refers to the problem of tagging a given\ninstance with a set of relevant labels. Most existing MLC methods are based on\nthe assumption that the correlation of two labels in each label pair is\nsymmetric, which is violated in many real-world scenarios. Moreover, most\nexisting methods design learning processes associated with the number of\nlabels, which makes their computational complexity a bottleneck when scaling up\nto large-scale output space. 
To tackle these issues, we propose a novel MLC\nlearning method named Scalable Label Distribution Learning (SLDL) for\nmulti-label classification which can describe different labels as distributions\nin a latent space, where the label correlation is asymmetric and the dimension\nis independent of the number of labels. Specifically, SLDL first converts\nlabels into continuous distributions within a low-dimensional latent space and\nleverages the asymmetric metric to establish the correlation between different\nlabels. Then, it learns the mapping from the feature space to the latent space,\nso that the computational complexity is no longer related to the number of\nlabels. Finally, SLDL leverages a nearest-neighbor-based strategy to decode the\nlatent representations and obtain the final predictions. Our extensive\nexperiments illustrate that SLDL can achieve very competitive classification\nperformances with little computational consumption.\n","authors":["Xingyu Zhao","Yuexuan An","Lei Qi","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2311.16556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.04840v5","updated":"2023-11-28T06:39:41Z","published":"2021-08-10T18:00:14Z","title":"Post-hoc Interpretability for Neural NLP: A Survey","summary":" Neural networks for NLP are becoming increasingly complex and widespread, and\nthere is a growing concern about whether these models are responsible to use. Explaining\nmodels helps to address the safety and ethical concerns and is essential for\naccountability. Interpretability serves to provide these explanations in terms\nthat are understandable to humans. Additionally, post-hoc methods provide\nexplanations after a model is learned and are generally model-agnostic. This\nsurvey provides a categorization of how recent post-hoc interpretability\nmethods communicate explanations to humans, it discusses each method in-depth,\nand how they are validated, as the latter is often a common concern.\n","authors":["Andreas Madsen","Siva Reddy","Sarath Chandar"],"pdf_url":"https://arxiv.org/pdf/2108.04840v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05204v2","updated":"2023-11-28T06:38:03Z","published":"2023-10-08T15:35:00Z","title":"Towards Optimizing with Large Language Models","summary":" In this work, we conduct an assessment of the optimization capabilities of\nLLMs across various tasks and data sizes. Each of these tasks corresponds to\nunique optimization domains, and LLMs are required to execute these tasks with\ninteractive prompting. That is, in each optimization step, the LLM generates\nnew solutions from the past generated solutions with their values, and then the\nnew solutions are evaluated and considered in the next optimization step.\nAdditionally, we introduce three distinct metrics for a comprehensive\nassessment of task performance from various perspectives. These metrics offer\nthe advantage of being applicable for evaluating LLM performance across a broad\nspectrum of optimization tasks and are less sensitive to variations in test\nsamples. By applying these metrics, we observe that LLMs exhibit strong\noptimization capabilities when dealing with small-sized samples. 
However, their\nperformance is significantly influenced by factors like data size and values,\nunderscoring the importance of further research in the domain of optimization\ntasks for LLMs.\n","authors":["Pei-Fu Guo","Ying-Hsuan Chen","Yun-Da Tsai","Shou-De Lin"],"pdf_url":"https://arxiv.org/pdf/2310.05204v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16507v1","updated":"2023-11-28T06:19:30Z","published":"2023-11-28T06:19:30Z","title":"Exploring Straighter Trajectories of Flow Matching with Diffusion\n Guidance","summary":" Flow matching as a paradigm of generative model achieves notable success\nacross various domains. However, existing methods use either multi-round\ntraining or knowledge within minibatches, posing challenges in finding a\nfavorable coupling strategy for straight trajectories. To address this issue,\nwe propose a novel approach, Straighter trajectories of Flow Matching\n(StraightFM). It straightens trajectories with the coupling strategy guided by\ndiffusion model from entire distribution level. First, we propose a coupling\nstrategy to straighten trajectories, creating couplings between image and noise\nsamples under diffusion model guidance. Second, StraightFM also integrates real\ndata to enhance training, employing a neural network to parameterize another\ncoupling process from images to noise samples. StraightFM is jointly optimized\nwith couplings from above two mutually complementary directions, resulting in\nstraighter trajectories and enabling both one-step and few-step generation.\nExtensive experiments demonstrate that StraightFM yields high quality samples\nwith fewer step. StraightFM generates visually appealing images with a lower\nFID among diffusion and traditional flow matching methods within 5 sampling\nsteps when trained on pixel space. In the latent space (i.e., Latent\nDiffusion), StraightFM achieves a lower KID value compared to existing methods\non the CelebA-HQ 256 dataset in fewer than 10 sampling steps.\n","authors":["Siyu Xing","Jie Cao","Huaibo Huang","Xiao-Yu Zhang","Ran He"],"pdf_url":"https://arxiv.org/pdf/2311.16507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.08164v4","updated":"2023-11-28T06:17:30Z","published":"2022-06-16T13:33:22Z","title":"Long Range Graph Benchmark","summary":" Graph Neural Networks (GNNs) that are based on the message passing (MP)\nparadigm generally exchange information between 1-hop neighbors to build node\nrepresentations at each layer. In principle, such networks are not able to\ncapture long-range interactions (LRI) that may be desired or necessary for\nlearning a given task on graphs. Recently, there has been an increasing\ninterest in development of Transformer-based methods for graphs that can\nconsider full node connectivity beyond the original sparse structure, thus\nenabling the modeling of LRI. However, MP-GNNs that simply rely on 1-hop\nmessage passing often fare better in several existing graph benchmarks when\ncombined with positional feature representations, among other innovations,\nhence limiting the perceived utility and ranking of Transformer-like\narchitectures. Here, we present the Long Range Graph Benchmark (LRGB) with 5\ngraph learning datasets: PascalVOC-SP, COCO-SP, PCQM-Contact, Peptides-func and\nPeptides-struct that arguably require LRI reasoning to achieve strong\nperformance in a given task. We benchmark both baseline GNNs and Graph\nTransformer networks to verify that the models which capture long-range\ndependencies perform significantly better on these tasks. 
Therefore, these\ndatasets are suitable for benchmarking and exploration of MP-GNNs and Graph\nTransformer architectures that are intended to capture LRI.\n","authors":["Vijay Prakash Dwivedi","Ladislav Rampášek","Mikhail Galkin","Ali Parviz","Guy Wolf","Anh Tuan Luu","Dominique Beaini"],"pdf_url":"https://arxiv.org/pdf/2206.08164v4.pdf","comment":"Added reference to T\\\"onshoff et al., 2023 in Sec. 4.1; NeurIPS 2022\n Track on D&B; Open-sourced at: https://github.com/vijaydwivedi75/lrgb"},{"id":"http://arxiv.org/abs/2311.16540v1","updated":"2023-11-28T06:12:57Z","published":"2023-11-28T06:12:57Z","title":"Communication Efficiency Optimization of Federated Learning for\n Computing and Network Convergence of 6G Networks","summary":" Federated learning effectively addresses issues such as data privacy by\ncollaborating across participating devices to train global models. However,\nfactors such as network topology and device computing power can affect its\ntraining or communication process in complex network environments. A new\nnetwork architecture and paradigm with computing-measurable, perceptible,\ndistributable, dispatchable, and manageable capabilities, computing and network\nconvergence (CNC) of 6G networks can effectively support federated learning\ntraining and improve its communication efficiency. By guiding the participating\ndevices' training in federated learning based on business requirements,\nresource load, network conditions, and arithmetic power of devices, CNC can\nreach this goal. In this paper, to improve the communication efficiency of\nfederated learning in complex networks, we study the communication efficiency\noptimization of federated learning for computing and network convergence of 6G\nnetworks, proposing methods that guide its training process under different\nnetwork conditions and arithmetic power of participating devices in federated\nlearning. The experiments address two architectures that exist for devices in\nfederated learning and arrange devices to participate in training based on\narithmetic power while achieving optimization of communication efficiency in\nthe process of transferring model parameters. The results show that the method\nwe proposed can (1) cope well with complex network situations, (2) effectively\nbalance the delay distribution of participating devices for local training, (3)\nimprove the communication efficiency during the transfer of model parameters,\nand (4) improve the resource utilization in the network.\n","authors":["Yizhuo Cai","Bo Lei","Qianying Zhao","Jing Peng","Min Wei","Yushun Zhang","Xing Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.16540v1.pdf","comment":"13 pages, 11 figures, accepted by Frontiers of Information Technology\n & Electronic Engineering"},{"id":"http://arxiv.org/abs/2211.05228v2","updated":"2023-11-28T06:09:31Z","published":"2022-11-07T09:38:34Z","title":"FIXED: Frustratingly Easy Domain Generalization with Mixup","summary":" Domain generalization (DG) aims to learn a generalizable model from multiple\ntraining domains such that it can perform well on unseen target domains. A\npopular strategy is to augment training data to benefit generalization through\nmethods such as Mixup~\\cite{zhang2018mixup}. While the vanilla Mixup can be\ndirectly applied, theoretical and empirical investigations uncover several\nshortcomings that limit its performance. Firstly, Mixup cannot effectively\nidentify the domain and class information that can be used for learning\ninvariant representations. 
Secondly, Mixup may introduce synthetic noisy data\npoints via random interpolation, which lowers its discrimination capability.\nBased on the analysis, we propose a simple yet effective enhancement for\nMixup-based DG, namely domain-invariant Feature mIXup (FIX). It learns\ndomain-invariant representations for Mixup. To further enhance discrimination,\nwe leverage existing techniques to enlarge margins among classes to further\npropose the domain-invariant Feature MIXup with Enhanced Discrimination (FIXED)\napproach. We present theoretical insights about guarantees on its\neffectiveness. Extensive experiments on seven public datasets across two\nmodalities including image classification (Digits-DG, PACS, Office-Home) and\ntime series (DSADS, PAMAP2, UCI-HAR, and USC-HAD) demonstrate that our approach\nsignificantly outperforms nine state-of-the-art related methods, beating the\nbest performing baseline by 6.5\\% on average in terms of test accuracy. Code is\navailable at:\nhttps://github.com/jindongwang/transferlearning/tree/master/code/deep/fixed.\n","authors":["Wang Lu","Jindong Wang","Han Yu","Lei Huang","Xiang Zhang","Yiqiang Chen","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2211.05228v2.pdf","comment":"First Conference on Parsimony and Learning (CPAL) 2024; code for DG\n at: https://github.com/jindongwang/transferlearning/tree/master/code/DeepDG"},{"id":"http://arxiv.org/abs/2311.16538v1","updated":"2023-11-28T06:08:16Z","published":"2023-11-28T06:08:16Z","title":"Federated Learning with Diffusion Models for Privacy-Sensitive Vision\n Tasks","summary":" Diffusion models have shown great potential for vision-related tasks,\nparticularly for image generation. However, their training is typically\nconducted in a centralized manner, relying on data collected from publicly\navailable sources. This approach may not be feasible or practical in many\ndomains, such as the medical field, which involves privacy concerns over data\ncollection. Despite the challenges associated with privacy-sensitive data, such\ndomains could still benefit from valuable vision services provided by diffusion\nmodels. Federated learning (FL) plays a crucial role in enabling decentralized\nmodel training without compromising data privacy. Instead of collecting data,\nan FL system gathers model parameters, effectively safeguarding the private\ndata of different parties involved. This makes FL systems vital for managing\ndecentralized learning tasks, especially in scenarios where privacy-sensitive\ndata is distributed across a network of clients. Nonetheless, FL presents its\nown set of challenges due to its distributed nature and privacy-preserving\nproperties. Therefore, in this study, we explore the FL strategy to train\ndiffusion models, paving the way for the development of federated diffusion\nmodels. 
We conduct experiments on various FL scenarios, and our findings\ndemonstrate that federated diffusion models have great potential to deliver\nvision services to privacy-sensitive domains.\n","authors":["Ye Lin Tun","Chu Myaet Thwal","Ji Su Yoon","Sun Moo Kang","Chaoning Zhang","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2311.16538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16536v1","updated":"2023-11-28T05:45:20Z","published":"2023-11-28T05:45:20Z","title":"Personalized Predictions of Glioblastoma Infiltration: Mathematical\n Models, Physics-Informed Neural Networks and Multimodal Scans","summary":" Predicting the infiltration of Glioblastoma (GBM) from medical MRI scans is\ncrucial for understanding tumor growth dynamics and designing personalized\nradiotherapy treatment plans.Mathematical models of GBM growth can complement\nthe data in the prediction of spatial distributions of tumor cells. However,\nthis requires estimating patient-specific parameters of the model from clinical\ndata, which is a challenging inverse problem due to limited temporal data and\nthe limited time between imaging and diagnosis. This work proposes a method\nthat uses Physics-Informed Neural Networks (PINNs) to estimate patient-specific\nparameters of a reaction-diffusion PDE model of GBM growth from a single 3D\nstructural MRI snapshot. PINNs embed both the data and the PDE into a loss\nfunction, thus integrating theory and data. Key innovations include the\nidentification and estimation of characteristic non-dimensional parameters, a\npre-training step that utilizes the non-dimensional parameters and a\nfine-tuning step to determine the patient specific parameters. Additionally,\nthe diffuse domain method is employed to handle the complex brain geometry\nwithin the PINN framework. Our method is validated both on synthetic and\npatient datasets, and shows promise for real-time parametric inference in the\nclinical setting for personalized GBM treatment.\n","authors":["Ray Zirui Zhang","Ivan Ezhov","Michal Balcerak","Andy Zhu","Benedikt Wiestler","Bjoern Menze","John Lowengrub"],"pdf_url":"https://arxiv.org/pdf/2311.16536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16535v1","updated":"2023-11-28T05:44:26Z","published":"2023-11-28T05:44:26Z","title":"Contrastive encoder pre-training-based clustered federated learning for\n heterogeneous data","summary":" Federated learning (FL) is a promising approach that enables distributed\nclients to collaboratively train a global model while preserving their data\nprivacy. However, FL often suffers from data heterogeneity problems, which can\nsignificantly affect its performance. To address this, clustered federated\nlearning (CFL) has been proposed to construct personalized models for different\nclient clusters. One effective client clustering strategy is to allow clients\nto choose their own local models from a model pool based on their performance.\nHowever, without pre-trained model parameters, such a strategy is prone to\nclustering failure, in which all clients choose the same model. Unfortunately,\ncollecting a large amount of labeled data for pre-training can be costly and\nimpractical in distributed environments. To overcome this challenge, we\nleverage self-supervised contrastive learning to exploit unlabeled data for the\npre-training of FL systems. Together, self-supervised pre-training and client\nclustering can be crucial components for tackling the data heterogeneity issues\nof FL. 
Leveraging these two crucial strategies, we propose contrastive\npre-training-based clustered federated learning (CP-CFL) to improve the model\nconvergence and overall performance of FL systems. In this work, we demonstrate\nthe effectiveness of CP-CFL through extensive experiments in heterogeneous FL\nsettings, and present various interesting observations.\n","authors":["Ye Lin Tun","Minh N. H. Nguyen","Chu Myaet Thwal","Jinwoo Choi","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2311.16535v1.pdf","comment":"Published in Neural Networks"},{"id":"http://arxiv.org/abs/2310.08164v2","updated":"2023-11-28T05:36:12Z","published":"2023-10-12T09:36:03Z","title":"Interpreting Reward Models in RLHF-Tuned Language Models Using Sparse\n Autoencoders","summary":" Large language models (LLMs) aligned to human preferences via reinforcement\nlearning from human feedback (RLHF) underpin many commercial applications of\nLLM technology. Despite this, the impacts of RLHF on LLM internals remain\nopaque. We propose a novel method for interpreting implicit reward models\n(IRMs) in LLMs learned through RLHF. Our approach trains pairs of autoencoders\non activations from a base LLM and its RLHF-tuned variant. Through a comparison\nof autoencoder hidden spaces, we identify features that reflect the accuracy of\nthe learned IRM. To illustrate our method, we fine-tune an LLM via RLHF to\nlearn a token-utility mapping and maximize the aggregate utility of generated\ntext. This is the first application of sparse autoencoders to interpreting\nIRMs. Our method provides an abstract approximation of reward integrity and\nholds promise for measuring alignment between specified objectives and learned\nmodel behaviors.\n","authors":["Luke Marks","Amir Abdullah","Luna Mendez","Rauno Arike","Philip Torr","Fazl Barez"],"pdf_url":"https://arxiv.org/pdf/2310.08164v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09506v2","updated":"2023-11-28T05:31:06Z","published":"2023-11-16T02:06:23Z","title":"Investigating the Impact of Weight Sharing Decisions on Knowledge\n Transfer in Continual Learning","summary":" Continual Learning (CL) has generated attention as a method of avoiding\nCatastrophic Forgetting (CF) in the sequential training of neural networks,\nimproving network efficiency and adaptability to different tasks. Additionally,\nCL serves as an ideal setting for studying network behavior and Forward\nKnowledge Transfer (FKT) between tasks. Pruning methods for CL train\nsubnetworks to handle the sequential tasks which allows us to take a structured\napproach to investigating FKT. Sharing prior subnetworks' weights leverages\npast knowledge for the current task through FKT. Understanding which weights to\nshare is important as sharing all weights can yield sub-optimal accuracy. This\npaper investigates how different sharing decisions affect the FKT between\ntasks. Through this lens we demonstrate how task complexity and similarity\ninfluence the optimal weight sharing decisions, giving insights into the\nrelationships between tasks and helping inform decision making in similar CL\nmethods. We implement three sequential datasets designed to emphasize variation\nin task complexity and similarity, reporting results for both ResNet-18 and\nVGG-16. 
By sharing in accordance with the decisions supported by our findings,\nwe show that we can improve task accuracy compared to other sharing decisions.\n","authors":["Josh Andle","Ali Payani","Salimeh Yasaei-Sekeh"],"pdf_url":"https://arxiv.org/pdf/2311.09506v2.pdf","comment":"5 Figures, 4 Tables, 2 Algorithms"},{"id":"http://arxiv.org/abs/2311.07222v2","updated":"2023-11-28T05:29:19Z","published":"2023-11-13T10:40:17Z","title":"Neural General Circulation Models","summary":" General circulation models (GCMs) are the foundation of weather and climate\nprediction. GCMs are physics-based simulators which combine a numerical solver\nfor large-scale dynamics with tuned representations for small-scale processes\nsuch as cloud formation. Recently, machine learning (ML) models trained on\nreanalysis data achieved comparable or better skill than GCMs for deterministic\nweather forecasting. However, these models have not demonstrated improved\nensemble forecasts, or shown sufficient stability for long-term weather and\nclimate simulations. Here we present the first GCM that combines a\ndifferentiable solver for atmospheric dynamics with ML components, and show\nthat it can generate forecasts of deterministic weather, ensemble weather and\nclimate on par with the best ML and physics-based methods. NeuralGCM is\ncompetitive with ML models for 1-10 day forecasts, and with the European Centre\nfor Medium-Range Weather Forecasts ensemble prediction for 1-15 day forecasts.\nWith prescribed sea surface temperature, NeuralGCM can accurately track climate\nmetrics such as global mean temperature for multiple decades, and climate\nforecasts with 140 km resolution exhibit emergent phenomena such as realistic\nfrequency and trajectories of tropical cyclones. For both weather and climate,\nour approach offers orders of magnitude computational savings over conventional\nGCMs. Our results show that end-to-end deep learning is compatible with tasks\nperformed by conventional GCMs, and can enhance the large-scale physical\nsimulations that are essential for understanding and predicting the Earth\nsystem.\n","authors":["Dmitrii Kochkov","Janni Yuval","Ian Langmore","Peter Norgaard","Jamie Smith","Griffin Mooers","James Lottes","Stephan Rasp","Peter Düben","Milan Klöwer","Sam Hatfield","Peter Battaglia","Alvaro Sanchez-Gonzalez","Matthew Willson","Michael P. Brenner","Stephan Hoyer"],"pdf_url":"https://arxiv.org/pdf/2311.07222v2.pdf","comment":"67 pages, 34 figures"},{"id":"http://arxiv.org/abs/2206.13269v2","updated":"2023-11-28T05:29:06Z","published":"2022-06-27T13:02:59Z","title":"Wasserstein Distributionally Robust Estimation in High Dimensions:\n Performance Analysis and Optimal Hyperparameter Tuning","summary":" Wasserstein distributionally robust optimization has recently emerged as a\npowerful framework for robust estimation, enjoying good out-of-sample\nperformance guarantees, well-understood regularization effects, and\ncomputationally tractable reformulations. In such framework, the estimator is\nobtained by minimizing the worst-case expected loss over all probability\ndistributions which are close, in a Wasserstein sense, to the empirical\ndistribution. In this paper, we propose a Wasserstein distributionally robust\nestimation framework to estimate an unknown parameter from noisy linear\nmeasurements, and we focus on the task of analyzing the squared error\nperformance of such estimators. 
Our study is carried out in the modern\nhigh-dimensional proportional regime, where both the ambient dimension and the\nnumber of samples go to infinity at a proportional rate which encodes the\nunder/over-parametrization of the problem. Under an isotropic Gaussian features\nassumption, we show that the squared error can be recovered as the solution of\na convex-concave optimization problem which, surprisingly, involves at most\nfour scalar variables. Importantly, the precise quantification of the squared\nerror allows us to accurately and efficiently compare different ambiguity radii\nand to understand the effect of the under/over-parametrization on the\nestimation error. We conclude the paper with a list of exciting research\ndirections enabled by our results.\n","authors":["Liviu Aolaritei","Soroosh Shafiee","Florian Dörfler"],"pdf_url":"https://arxiv.org/pdf/2206.13269v2.pdf","comment":"This paper was previously titled \"The Performance of Wasserstein\n Distributionally Robust M-Estimators in High Dimensions\""},{"id":"http://arxiv.org/abs/2310.05898v4","updated":"2023-11-28T05:21:41Z","published":"2023-10-09T17:41:29Z","title":"Lion Secretly Solves Constrained Optimization: As Lyapunov Predicts","summary":" Lion (Evolved Sign Momentum), a new optimizer discovered through program\nsearch, has shown promising results in training large AI models. It performs\ncomparably or favorably to AdamW but with greater memory efficiency. As we can\nexpect from the results of a random search program, Lion incorporates elements\nfrom several existing algorithms, including signed momentum, decoupled weight\ndecay, Polak, and Nesterov momentum, but does not fit into any existing\ncategory of theoretically grounded optimizers. Thus, even though Lion appears\nto perform well as a general-purpose optimizer for a wide range of tasks, its\ntheoretical basis remains uncertain. This lack of theoretical clarity limits\nopportunities to further enhance and expand Lion's efficacy.\n This work aims to demystify Lion. Based on both continuous-time and\ndiscrete-time analysis, we demonstrate that Lion is a theoretically novel and\nprincipled approach for minimizing a general loss function $f(x)$ while\nenforcing a bound constraint $\\|x\\|_\\infty \\leq 1/\\lambda$. Lion achieves this\nthrough the incorporation of decoupled weight decay, where $\\lambda$ represents\nthe weight decay coefficient. Our analysis is made possible by the development\nof a new Lyapunov function for the Lion updates. It applies to a broader family\nof Lion-$\\kappa$ algorithms, where the $\\text{sign}(\\cdot)$ operator in Lion is\nreplaced by the subgradient of a convex function $\\kappa$, leading to the\nsolution of a general composite optimization problem of $\\min_x f(x) +\n\\kappa^*(x)$. Our findings provide valuable insights into the dynamics of Lion\nand pave the way for further improvements and extensions of Lion-related\nalgorithms.\n","authors":["Lizhang Chen","Bo Liu","Kaizhao Liang","Qiang Liu"],"pdf_url":"https://arxiv.org/pdf/2310.05898v4.pdf","comment":"31 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.16528v1","updated":"2023-11-28T05:19:23Z","published":"2023-11-28T05:19:23Z","title":"Utility Fairness in Contextual Dynamic Pricing with Demand Learning","summary":" This paper introduces a novel contextual bandit algorithm for personalized\npricing under utility fairness constraints in scenarios with uncertain demand,\nachieving an optimal regret upper bound. 
Our approach, which incorporates\ndynamic pricing and demand learning, addresses the critical challenge of\nfairness in pricing strategies. We first delve into the static full-information\nsetting to formulate an optimal pricing policy as a constrained optimization\nproblem. Here, we propose an approximation algorithm for efficiently and\napproximately computing the ideal policy.\n We also use mathematical analysis and computational studies to characterize\nthe structures of optimal contextual pricing policies subject to fairness\nconstraints, deriving simplified policies which lay the foundations for more\nin-depth research and extensions.\n Further, we extend our study to dynamic pricing problems with demand\nlearning, establishing a non-standard regret lower bound that highlights the\ncomplexity added by fairness constraints. Our research offers a comprehensive\nanalysis of the cost of fairness and its impact on the balance between utility\nand revenue maximization. This work represents a step towards integrating\nethical considerations into algorithmic efficiency in data-driven dynamic\npricing.\n","authors":["Xi Chen","David Simchi-Levi","Yining Wang"],"pdf_url":"https://arxiv.org/pdf/2311.16528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14053v2","updated":"2023-11-28T05:18:31Z","published":"2023-09-25T11:35:10Z","title":"Revisiting LARS for Large Batch Training Generalization of Neural\n Networks","summary":" LARS and LAMB have emerged as prominent techniques in Large Batch Learning\n(LBL) to ensure training stability in AI. Convergence stability is a challenge\nin LBL, where the AI agent usually gets trapped in the sharp minimizer. To\naddress this challenge, warm-up is an efficient technique, but it lacks a\nstrong theoretical foundation. Specifically, the warm-up process often reduces\ngradients in the early phase, inadvertently preventing the agent from escaping\nthe sharp minimizer early on. In light of this situation, we conduct empirical\nexperiments to analyze the behaviors of LARS and LAMB with and without a\nwarm-up strategy. Our analyses give a comprehensive insight into the behaviors\nof LARS, LAMB, and the necessity of a warm-up technique in LBL, including an\nexplanation of their failure in many cases. Building upon these insights, we\npropose a novel algorithm called Time Varying LARS (TVLARS), which facilitates\nrobust training in the initial phase without the need for warm-up. A\nconfigurable sigmoid-like function is employed in TVLARS to replace the warm-up\nprocess to enhance training stability. Moreover, TVLARS stimulates gradient\nexploration in the early phase, thus allowing it to surpass the sharp minimizers\nearly on and gradually transition to LARS, achieving the robustness of LARS in\nthe later phases. Extensive experimental evaluations reveal that TVLARS\nconsistently outperforms LARS and LAMB in most cases, with improvements of up\nto 2% in classification scenarios. Notably, in every case of self-supervised\nlearning, TVLARS dominates LARS and LAMB with performance improvements of up to\n10%.\n","authors":["Khoi Do","Duong Nguyen","Hoa Nguyen","Long Tran-Thanh","Quoc-Viet Pham"],"pdf_url":"https://arxiv.org/pdf/2309.14053v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16526v1","updated":"2023-11-28T05:11:53Z","published":"2023-11-28T05:11:53Z","title":"On robust overfitting: adversarial training induced distribution matters","summary":" Adversarial training may be regarded as standard training with a modified\nloss function. 
But its generalization error appears much larger than that of standard\ntraining under the standard loss. This phenomenon, known as robust overfitting, has\nattracted significant research attention and remains largely a mystery. In\nthis paper, we first show empirically that robust overfitting correlates with\nthe increasing generalization difficulty of the perturbation-induced\ndistributions along the trajectory of adversarial training (specifically\nPGD-based adversarial training). We then provide a novel upper bound for\ngeneralization error with respect to the perturbation-induced distributions, in\nwhich a notion of the perturbation operator, referred to as \"local dispersion\",\nplays an important role.\n","authors":["Runzhi Tian","Yongyi Mao"],"pdf_url":"https://arxiv.org/pdf/2311.16526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18766v3","updated":"2023-11-28T05:09:02Z","published":"2023-05-30T05:56:58Z","title":"HiFA: High-fidelity Text-to-3D Generation with Advanced Diffusion\n Guidance","summary":" The advancements in automatic text-to-3D generation have been remarkable.\nMost existing methods use pre-trained text-to-image diffusion models to\noptimize 3D representations like Neural Radiance Fields (NeRFs) via\nlatent-space denoising score matching. Yet, these methods often result in\nartifacts and inconsistencies across different views due to their suboptimal\noptimization approaches and limited understanding of 3D geometry. Moreover, the\ninherent constraints of NeRFs in rendering crisp geometry and stable textures\nusually lead to a two-stage optimization to attain high-resolution details.\nThis work proposes holistic sampling and smoothing approaches to achieve\nhigh-quality text-to-3D generation, all in a single-stage optimization. We\ncompute denoising scores in the text-to-image diffusion model's latent and\nimage spaces. Instead of randomly sampling timesteps (also referred to as noise\nlevels in denoising score matching), we introduce a novel timestep annealing\napproach that progressively reduces the sampled timestep throughout\noptimization. To generate high-quality renderings in a single-stage\noptimization, we propose regularization for the variance of z-coordinates along\nNeRF rays. To address texture flickering issues in NeRFs, we introduce a kernel\nsmoothing technique that refines importance sampling weights coarse-to-fine,\nensuring accurate and thorough sampling in high-density regions. Extensive\nexperiments demonstrate the superiority of our method over previous approaches,\nenabling the generation of highly detailed and view-consistent 3D assets\nthrough a single-stage training process.\n","authors":["Junzhe Zhu","Peiye Zhuang"],"pdf_url":"https://arxiv.org/pdf/2305.18766v3.pdf","comment":"Project page: https://hifa-team.github.io/HiFA-site/"},{"id":"http://arxiv.org/abs/2311.16524v1","updated":"2023-11-28T05:06:22Z","published":"2023-11-28T05:06:22Z","title":"3D Teeth Reconstruction from Panoramic Radiographs using Neural Implicit\n Functions","summary":" Panoramic radiography is a widely used imaging modality in dental practice\nand research. However, it only provides flattened 2D images, which limits the\ndetailed assessment of dental structures. In this paper, we propose Occudent, a\nframework for 3D teeth reconstruction from panoramic radiographs using neural\nimplicit functions, which, to the best of our knowledge, is the first work to\ndo so. 
For a given point in 3D space, the implicit function estimates whether\nthe point is occupied by a tooth, and thus implicitly determines the boundaries\nof 3D tooth shapes. Firstly, Occudent applies multi-label segmentation to the\ninput panoramic radiograph. Next, tooth shape embeddings as well as tooth class\nembeddings are generated from the segmentation outputs, which are fed to the\nreconstruction network. A novel module called Conditional eXcitation (CX) is\nproposed in order to effectively incorporate the combined shape and class\nembeddings into the implicit function. The performance of Occudent is evaluated\nusing both quantitative and qualitative measures. Importantly, Occudent is\ntrained and validated with actual panoramic radiographs as input, distinct from\nrecent works which used synthesized images. Experiments demonstrate the\nsuperiority of Occudent over state-of-the-art methods.\n","authors":["Sihwa Park","Seongjun Kim","In-Seok Song","Seung Jun Baek"],"pdf_url":"https://arxiv.org/pdf/2311.16524v1.pdf","comment":"12 pages, 2 figures, accepted to International Conference on Medical\n Image Computing and Computer-Assisted Intervention MICCAI 2023"},{"id":"http://arxiv.org/abs/2311.16522v1","updated":"2023-11-28T05:00:27Z","published":"2023-11-28T05:00:27Z","title":"Evaluation of dynamic characteristics of power grid based on GNN and\n application on knowledge graph","summary":" A novel method for detecting faults in power grids using a graph neural\nnetwork (GNN) has been developed, aimed at enhancing intelligent fault\ndiagnosis in network operation and maintenance. This GNN-based approach\nidentifies faulty nodes within the power grid through a specialized electrical\nfeature extraction model coupled with a knowledge graph. Incorporating temporal\ndata, the method leverages the status of nodes from preceding and subsequent\ntime periods to aid in current fault detection. To validate the effectiveness\nof this GNN in extracting node features, a correlation analysis of the output\nfeatures from each node within the neural network layer was conducted. The\nresults from experiments show that this method can accurately locate fault\nnodes in simulated scenarios with a remarkable 99.53% accuracy. Additionally,\nthe graph neural network's feature modeling allows for a qualitative\nexamination of how faults spread across nodes, providing valuable insights for\nanalyzing fault nodes.\n","authors":["Hao Pei","Si Lin","Chuanfu Li","Che Wang","Haoming Chen","Sizhe Li"],"pdf_url":"https://arxiv.org/pdf/2311.16522v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16520v1","updated":"2023-11-28T04:58:41Z","published":"2023-11-28T04:58:41Z","title":"Value Approximation for Two-Player General-Sum Differential Games with\n State Constraints","summary":" Solving Hamilton-Jacobi-Isaacs (HJI) PDEs enables equilibrial feedback\ncontrol in two-player differential games, yet faces the curse of dimensionality\n(CoD). While physics-informed machine learning has been adopted to address CoD\nin solving PDEs, this method falls short in learning discontinuous solutions\ndue to its sampling nature, leading to poor safety performance of the resulting\ncontrollers in robotics applications where values are discontinuous due to\nstate or other temporal logic constraints. 
In this study, we explore three\npotential solutions to this problem: (1) a hybrid learning method that uses\nboth equilibrium demonstrations and the HJI PDE, (2) a value-hardening method\nwhere a sequence of HJIs are solved with increasing Lipschitz constant on the\nconstraint violation penalty, and (3) the epigraphical technique that lifts the\nvalue to a higher dimensional auxiliary state space where the value becomes\ncontinuous. Evaluations through 5D and 9D vehicle simulations and 13D drone\nsimulations reveal that the hybrid method outperforms others in terms of\ngeneralization and safety performance.\n","authors":["Lei Zhang","Mukesh Ghimire","Wenlong Zhang","Zhe Xu","Yi Ren"],"pdf_url":"https://arxiv.org/pdf/2311.16520v1.pdf","comment":"Submitted to TRO"},{"id":"http://arxiv.org/abs/2311.16519v1","updated":"2023-11-28T04:58:17Z","published":"2023-11-28T04:58:17Z","title":"B-LSTM-MIONet: Bayesian LSTM-based Neural Operators for Learning the\n Response of Complex Dynamical Systems to Length-Variant Multiple Input\n Functions","summary":" Deep Operator Network (DeepONet) is a neural network framework for learning\nnonlinear operators such as those from ordinary differential equations (ODEs)\ndescribing complex systems. Multiple-input deep neural operators (MIONet)\nextended DeepONet to allow multiple input functions in different Banach spaces.\nMIONet offers flexibility in training dataset grid spacing, without constraints\non output location. However, it requires offline inputs and cannot handle\nvarying sequence lengths in testing datasets, limiting its real-time\napplication in dynamic complex systems. This work redesigns MIONet, integrating\nLong Short Term Memory (LSTM) to learn neural operators from time-dependent\ndata. This approach overcomes data discretization constraints and harnesses\nLSTM's capability with variable-length, real-time data. Factors affecting\nlearning performance, like algorithm extrapolation ability are presented. The\nframework is enhanced with uncertainty quantification through a novel Bayesian\nmethod, sampling from MIONet parameter distributions. Consequently, we develop\nthe B-LSTM-MIONet, incorporating LSTM's temporal strengths with Bayesian\nrobustness, resulting in a more precise and reliable model for noisy datasets.\n","authors":["Zhihao Kong","Amirhossein Mollaali","Christian Moya","Na Lu","Guang Lin"],"pdf_url":"https://arxiv.org/pdf/2311.16519v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16509v1","updated":"2023-11-28T04:49:17Z","published":"2023-11-28T04:49:17Z","title":"StyleCap: Automatic Speaking-Style Captioning from Speech Based on\n Speech and Language Self-supervised Learning Models","summary":" We propose StyleCap, a method to generate natural language descriptions of\nspeaking styles appearing in speech. Although most of conventional techniques\nfor para-/non-linguistic information recognition focus on the category\nclassification or the intensity estimation of pre-defined labels, they cannot\nprovide the reasoning of the recognition result in an interpretable manner. As\na first step towards an end-to-end method for generating speaking-style prompts\nfrom speech, i.e., automatic speaking-style captioning, StyleCap uses paired\ndata of speech and natural language descriptions to train neural networks that\npredict prefix vectors fed into a large language model (LLM)-based text decoder\nfrom a speech representation vector. We explore an appropriate text decoder and\nspeech feature representation suitable for this new task. 
The experimental\nresults demonstrate that our StyleCap leveraging richer LLMs for the text\ndecoder, speech self-supervised learning (SSL) features, and sentence\nrephrasing augmentation improves the accuracy and diversity of generated\nspeaking-style captions. Samples of speaking-style captions generated by our\nStyleCap are publicly available.\n","authors":["Kazuki Yamauchi","Yusuke Ijima","Yuki Saito"],"pdf_url":"https://arxiv.org/pdf/2311.16509v1.pdf","comment":"Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2307.12226v2","updated":"2023-11-28T04:35:51Z","published":"2023-07-23T04:48:41Z","title":"Geometry-Aware Adaptation for Pretrained Models","summary":" Machine learning models -- including prominent zero-shot models -- are often\ntrained on datasets whose labels are only a small proportion of a larger label\nspace. Such spaces are commonly equipped with a metric that relates the labels\nvia distances between them. We propose a simple approach to exploit this\ninformation to adapt the trained model to reliably predict new classes -- or,\nin the case of zero-shot prediction, to improve its performance -- without any\nadditional training. Our technique is a drop-in replacement of the standard\nprediction rule, swapping argmax with the Fr\\'echet mean. We provide a\ncomprehensive theoretical analysis for this approach, studying (i)\nlearning-theoretic results trading off label space diameter, sample complexity,\nand model dimension, (ii) characterizations of the full range of scenarios in\nwhich it is possible to predict any unobserved class, and (iii) an optimal\nactive learning-like next class selection procedure to obtain optimal training\nclasses for when it is not possible to predict the entire range of unobserved\nclasses. Empirically, using easily-available external metrics, our proposed\napproach, Loki, gains up to 29.7% relative improvement over SimCLR on ImageNet\nand scales to hundreds of thousands of classes. When no such metric is\navailable, Loki can use self-derived metrics from class embeddings and obtains\na 10.5% improvement on pretrained zero-shot models such as CLIP.\n","authors":["Nicholas Roberts","Xintong Li","Dyah Adila","Sonia Cromp","Tzu-Heng Huang","Jitian Zhao","Frederic Sala"],"pdf_url":"https://arxiv.org/pdf/2307.12226v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.16487v1","updated":"2023-11-28T04:34:04Z","published":"2023-11-28T04:34:04Z","title":"On the Robustness of Decision-Focused Learning","summary":" Decision-Focused Learning (DFL) is an emerging learning paradigm that tackles\nthe task of training a machine learning (ML) model to predict missing\nparameters of an incomplete optimization problem, where the missing parameters\nare predicted. DFL trains an ML model in an end-to-end system, by integrating\nthe prediction and optimization tasks, providing better alignment of the\ntraining and testing objectives. DFL has shown a lot of promise and holds the\ncapacity to revolutionize decision-making in many real-world applications.\nHowever, very little is known about the performance of these models under\nadversarial attacks. We adopt ten unique DFL methods and benchmark their\nperformance under two distinctly focused attacks adapted towards the\nPredict-then-Optimize problem setting. Our study proposes the hypothesis that\nthe robustness of a model is highly correlated with its ability to find\npredictions that lead to optimal decisions without deviating from the\nground-truth label. 
Furthermore, we provide insight into how to target the\nmodels that violate this condition and show how these models respond\ndifferently depending on the achieved optimality at the end of their training\ncycles.\n","authors":["Yehya Farhat"],"pdf_url":"https://arxiv.org/pdf/2311.16487v1.pdf","comment":"17 pages, 45 figures, submitted to AAAI artificial intelligence for\n operations research workshop"},{"id":"http://arxiv.org/abs/2306.15868v3","updated":"2023-11-28T04:28:48Z","published":"2023-06-28T01:50:46Z","title":"GraSS: Contrastive Learning with Gradient Guided Sampling Strategy for\n Remote Sensing Image Semantic Segmentation","summary":" Self-supervised contrastive learning (SSCL) has achieved significant\nmilestones in remote sensing image (RSI) understanding. Its essence lies in\ndesigning an unsupervised instance discrimination pretext task to extract image\nfeatures from a large number of unlabeled images that are beneficial for\ndownstream tasks. However, existing instance discrimination based SSCL suffer\nfrom two limitations when applied to the RSI semantic segmentation task: 1)\nPositive sample confounding issue; 2) Feature adaptation bias. It introduces a\nfeature adaptation bias when applied to semantic segmentation tasks that\nrequire pixel-level or object-level features. In this study, We observed that\nthe discrimination information can be mapped to specific regions in RSI through\nthe gradient of unsupervised contrastive loss, these specific regions tend to\ncontain singular ground objects. Based on this, we propose contrastive learning\nwith Gradient guided Sampling Strategy (GraSS) for RSI semantic segmentation.\nGraSS consists of two stages: Instance Discrimination warm-up (ID warm-up) and\nGradient guided Sampling contrastive training (GS training). The ID warm-up\naims to provide initial discrimination information to the contrastive loss\ngradients. The GS training stage aims to utilize the discrimination information\ncontained in the contrastive loss gradients and adaptively select regions in\nRSI patches that contain more singular ground objects, in order to construct\nnew positive and negative samples. Experimental results on three open datasets\ndemonstrate that GraSS effectively enhances the performance of SSCL in\nhigh-resolution RSI semantic segmentation. Compared to seven baseline methods\nfrom five different types of SSCL, GraSS achieves an average improvement of\n1.57\\% and a maximum improvement of 3.58\\% in terms of mean intersection over\nthe union. The source code is available at https://github.com/GeoX-Lab/GraSS\n","authors":["Zhaoyang Zhang","Zhen Ren","Chao Tao","Yunsheng Zhang","Chengli Peng","Haifeng Li"],"pdf_url":"https://arxiv.org/pdf/2306.15868v3.pdf","comment":"14 pages, 10 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.12532v3","updated":"2023-11-28T03:42:45Z","published":"2023-08-24T03:43:02Z","title":"FedSOL: Stabilized Orthogonal Learning in Federated Learning","summary":" Federated Learning (FL) aggregates locally trained models from individual\nclients to construct a global model. While FL enables learning a model with\ndata privacy, it often suffers from significant performance degradation when\nclient data distributions are heterogeneous. Many previous FL algorithms have\naddressed this issue by introducing various proximal restrictions. These\nrestrictions aim to encourage global alignment by constraining the deviation of\nlocal learning from the global objective. 
However, they inherently limit local\nlearning by interfering with the original local objectives. Recently, an\nalternative approach has emerged to improve local learning generality. By\nobtaining local models within a smooth loss landscape, this approach mitigates\nconflicts among different local objectives of the clients. Yet, it does not\nensure stable global alignment, as local learning does not take the global\nobjective into account. In this study, we propose Federated Stability on\nLearning (FedSoL), which combines both the concepts of global alignment and\nlocal generality. In FedSoL, the local learning seeks a parameter region robust\nagainst proximal perturbations. This strategy introduces an implicit proximal\nrestriction effect in local learning while maintaining the original local\nobjective for parameter update. Our experiments show that FedSoL consistently\nachieves state-of-the-art performance on various setups.\n","authors":["Gihun Lee","Minchan Jeong","Sangmook Kim","Jaehoon Oh","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2308.12532v3.pdf","comment":"19 pages, 12 figures"},{"id":"http://arxiv.org/abs/2310.11676v3","updated":"2023-11-28T03:42:30Z","published":"2023-10-18T02:59:57Z","title":"PREM: A Simple Yet Effective Approach for Node-Level Graph Anomaly\n Detection","summary":" Node-level graph anomaly detection (GAD) plays a critical role in identifying\nanomalous nodes from graph-structured data in various domains such as medicine,\nsocial networks, and e-commerce. However, challenges have arisen due to the\ndiversity of anomalies and the dearth of labeled data. Existing methodologies -\nreconstruction-based and contrastive learning - while effective, often suffer\nfrom efficiency issues, stemming from their complex objectives and elaborate\nmodules. To improve the efficiency of GAD, we introduce a simple method termed\nPREprocessing and Matching (PREM for short). Our approach streamlines GAD,\nreducing time and memory consumption while maintaining powerful anomaly\ndetection capabilities. Comprising two modules - a pre-processing module and an\nego-neighbor matching module - PREM eliminates the necessity for\nmessage-passing propagation during training, and employs a simple contrastive\nloss, leading to considerable reductions in training time and memory usage.\nMoreover, through rigorous evaluations of five real-world datasets, our method\ndemonstrated robustness and effectiveness. Notably, when validated on the ACM\ndataset, PREM achieved a 5% improvement in AUC, a 9-fold increase in training\nspeed, and sharply reduce memory usage compared to the most efficient baseline.\n","authors":["Junjun Pan","Yixin Liu","Yizhen Zheng","Shirui Pan"],"pdf_url":"https://arxiv.org/pdf/2310.11676v3.pdf","comment":"Accepted by IEEE International Conference of Data Mining 2023 (ICDM\n 2023)"},{"id":"http://arxiv.org/abs/2007.10784v3","updated":"2023-11-28T03:35:32Z","published":"2020-07-16T21:14:45Z","title":"OccamNet: A Fast Neural Model for Symbolic Regression at Scale","summary":" Neural networks' expressiveness comes at the cost of complex, black-box\nmodels that often extrapolate poorly beyond the domain of the training dataset,\nconflicting with the goal of finding compact analytic expressions to describe\nscientific data. We introduce OccamNet, a neural network model that finds\ninterpretable, compact, and sparse symbolic fits to data, \\`a la Occam's razor.\nOur model defines a probability distribution over functions with efficient\nsampling and function evaluation. 
We train by sampling functions and biasing\nthe probability mass toward better fitting solutions, backpropagating using\ncross-entropy matching in a reinforcement-learning loss. OccamNet can identify\nsymbolic fits for a variety of problems, including analytic and non-analytic\nfunctions, implicit functions, and simple image classification, and can\noutperform state-of-the-art symbolic regression methods on real-world\nregression datasets. Our method requires a minimal memory footprint, fits\ncomplicated functions in minutes on a single CPU, and scales on a GPU.\n","authors":["Owen Dugan","Rumen Dangovski","Allan Costa","Samuel Kim","Pawan Goyal","Joseph Jacobson","Marin Soljačić"],"pdf_url":"https://arxiv.org/pdf/2007.10784v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16459v1","updated":"2023-11-28T03:34:22Z","published":"2023-11-28T03:34:22Z","title":"On the Effect of Defections in Federated Learning and How to Prevent\n Them","summary":" Federated learning is a machine learning protocol that enables a large\npopulation of agents to collaborate over multiple rounds to produce a single\nconsensus model. There are several federated learning applications where agents\nmay choose to defect permanently$-$essentially withdrawing from the\ncollaboration$-$if they are content with their instantaneous model in that\nround. This work demonstrates the detrimental impact of such defections on the\nfinal model's robustness and ability to generalize. We also show that current\nfederated optimization algorithms fail to disincentivize these harmful\ndefections. We introduce a novel optimization algorithm with theoretical\nguarantees to prevent defections while ensuring asymptotic convergence to an\neffective solution for all participating agents. We also provide numerical\nexperiments to corroborate our findings and demonstrate the effectiveness of\nour algorithm.\n","authors":["Minbiao Han","Kumar Kshitij Patel","Han Shao","Lingxiao Wang"],"pdf_url":"https://arxiv.org/pdf/2311.16459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.13164v2","updated":"2023-11-28T03:13:42Z","published":"2023-10-19T21:31:11Z","title":"Almost Equivariance via Lie Algebra Convolutions","summary":" Recently, the equivariance of models with respect to a group action has\nbecome an important topic of research in machine learning. However, imbuing an\narchitecture with a specific group equivariance imposes a strong prior on the\ntypes of data transformations that the model expects to see. While\nstrictly-equivariant models enforce symmetries, real-world data does not always\nconform to such strict equivariances, be it due to noise in the data or\nunderlying physical laws that encode only approximate or partial symmetries. In\nsuch cases, the prior of strict equivariance can actually prove too strong and\ncause models to underperform on real-world data. Therefore, in this work we\nstudy a closely related topic, that of almost equivariance. We provide a\ndefinition of almost equivariance that differs from those extant in the current\nliterature and give a practical method for encoding almost equivariance in\nmodels by appealing to the Lie algebra of a Lie group. 
Specifically, we define\nLie algebra convolutions and demonstrate that they offer several benefits over\nLie group convolutions, including being well-defined for non-compact groups.\nFrom there, we pivot to the realm of theory and demonstrate connections between\nthe notions of equivariance and isometry and those of almost equivariance and\nalmost isometry, respectively. We prove two existence theorems, one showing the\nexistence of almost isometries within bounded distance of isometries of a\ngeneral manifold, and another showing the converse for Hilbert spaces. We then\nextend these theorems to prove the existence of almost equivariant manifold\nembeddings within bounded distance of fully equivariant embedding functions,\nsubject to certain constraints on the group action and the function class.\nFinally, we demonstrate the validity of our approach by benchmarking against\ndatasets in fully equivariant and almost equivariant settings.\n","authors":["Daniel McNeela"],"pdf_url":"https://arxiv.org/pdf/2310.13164v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16442v1","updated":"2023-11-28T02:44:59Z","published":"2023-11-28T02:44:59Z","title":"Enabling Fast 2-bit LLM on GPUs: Memory Alignment, Sparse Outlier, and\n Asynchronous Dequantization","summary":" Large language models (LLMs) have demonstrated impressive abilities in\nvarious domains while the inference cost is expensive. The state-of-the-art\nmethods use 2-bit quantization for mainstream LLMs. However, challenges still\nexist: (1) Nonnegligible accuracy loss for 2-bit quantization. Weights are\nquantized by groups, while the ranges of weights are large in some groups,\nresulting in large quantization errors and nonnegligible accuracy loss (e.g.\n>3% for Llama2-7b with 2-bit quantization in GPTQ and Greenbit). (2) Limited\naccuracy improvement by adding 4-bit weights. Increasing 10% extra average bit\nmore 4-bit weights only leads to <0.5% accuracy improvement on a quantized\nLlama2-7b. (3) Time-consuming dequantization operations on GPUs. The\ndequantization operations lead to >50% execution time, hindering the potential\nof reducing LLM inference cost. To tackle these challenges, we propose the\nfollowing techniques: (1) We only quantize a small fraction of groups with the\nlarger range using 4-bit with memory alignment consideration on GPUs. (2) We\npoint out that the distribution of the sparse outliers with larger weights is\ndifferent in 2-bit and 4-bit groups, and only a small fraction of outliers\nrequire 16-bit quantization. Such design leads to >0.5% accuracy improvement\nwith <3% average increased bit for Llama2-7b. (3) We design the asynchronous\ndequantization on GPUs, leading to up to 3.92X speedup. We conduct extensive\nexperiments on different model families and model sizes. We achieve 2.85-bit\nfor each weight and the end-to-end speedup for Llama2-7b is 1.74X over the\noriginal model, and we reduce both runtime cost and hardware cost by up to\n2.70X and 2.81X with less GPU requirements.\n","authors":["Jinhao Li","Shiyao Li","Jiaming Xu","Shan Huang","Yaoxiu Lian","Jun Liu","Yu Wang","Guohao Dai"],"pdf_url":"https://arxiv.org/pdf/2311.16442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00154v2","updated":"2023-11-28T02:28:21Z","published":"2023-06-30T22:05:34Z","title":"Stitched ViTs are Flexible Vision Backbones","summary":" Large pretrained plain vision Transformers (ViTs) have been the workhorse for\nmany downstream tasks. 
However, existing works utilizing off-the-shelf ViTs are\ninefficient in terms of training and deployment, because adopting ViTs with\nindividual sizes requires separate trainings and is restricted by fixed\nperformance-efficiency trade-offs. In this paper, we are inspired by stitchable\nneural networks (SN-Net), which is a new framework that cheaply produces a\nsingle model that covers rich subnetworks by stitching pretrained model\nfamilies, supporting diverse performance-efficiency trade-offs at runtime.\nBuilding upon this foundation, we introduce SN-Netv2, a systematically improved\nmodel stitching framework to facilitate downstream task adaptation.\nSpecifically, we first propose a two-way stitching scheme to enlarge the\nstitching space. We then design a resource-constrained sampling strategy that\ntakes into account the underlying FLOPs distributions in the space for better\nsampling. Finally, we observe that learning stitching layers as a low-rank\nupdate plays an essential role on downstream tasks to stabilize training and\nensure a good Pareto frontier. With extensive experiments on ImageNet-1K,\nADE20K, COCO-Stuff-10K and NYUv2, SN-Netv2 demonstrates superior performance\nover SN-Netv1 on downstream dense predictions and shows strong ability as a\nflexible vision backbone, achieving great advantages in both training\nefficiency and deployment flexibility. Code is available at\nhttps://github.com/ziplab/SN-Netv2.\n","authors":["Zizheng Pan","Jing Liu","Haoyu He","Jianfei Cai","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2307.00154v2.pdf","comment":"Tech report"},{"id":"http://arxiv.org/abs/2311.16432v1","updated":"2023-11-28T02:27:31Z","published":"2023-11-28T02:27:31Z","title":"Text-Driven Image Editing via Learnable Regions","summary":" Language has emerged as a natural interface for image editing. In this paper,\nwe introduce a method for region-based image editing driven by textual prompts,\nwithout the need for user-provided masks or sketches. Specifically, our\napproach leverages an existing pretrained text-to-image model and introduces a\nbounding box generator to find the edit regions that are aligned with the\ntextual prompts. We show that this simple approach enables flexible editing\nthat is compatible with current image generation models, and is able to handle\ncomplex prompts featuring multiple objects, complex sentences or long\nparagraphs. We conduct an extensive user study to compare our method against\nstate-of-the-art methods. Experiments demonstrate the competitive performance\nof our method in manipulating images with high fidelity and realism that align\nwith the language descriptions provided. Our project webpage:\nhttps://yuanze-lin.me/LearnableRegions_page.\n","authors":["Yuanze Lin","Yi-Wen Chen","Yi-Hsuan Tsai","Lu Jiang","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2311.16432v1.pdf","comment":"Project webpage: https://yuanze-lin.me/LearnableRegions_page"},{"id":"http://arxiv.org/abs/2311.16424v1","updated":"2023-11-28T02:08:06Z","published":"2023-11-28T02:08:06Z","title":"Manifold Preserving Guided Diffusion","summary":" Despite the recent advancements, conditional image generation still faces\nchallenges of cost, generalizability, and the need for task-specific training.\nIn this paper, we propose Manifold Preserving Guided Diffusion (MPGD), a\ntraining-free conditional generation framework that leverages pretrained\ndiffusion models and off-the-shelf neural networks with minimal additional\ninference cost for a broad range of tasks. 
Specifically, we leverage the\nmanifold hypothesis to refine the guided diffusion steps and introduce a\nshortcut algorithm in the process. We then propose two methods for on-manifold\ntraining-free guidance using pre-trained autoencoders and demonstrate that our\nshortcut inherently preserves the manifolds when applied to latent diffusion\nmodels. Our experiments show that MPGD is efficient and effective for solving a\nvariety of conditional generation applications in low-compute settings, and can\nconsistently offer up to 3.8x speed-ups with the same number of diffusion steps\nwhile maintaining high sample quality compared to the baselines.\n","authors":["Yutong He","Naoki Murata","Chieh-Hsin Lai","Yuhta Takida","Toshimitsu Uesaka","Dongjun Kim","Wei-Hsiang Liao","Yuki Mitsufuji","J. Zico Kolter","Ruslan Salakhutdinov","Stefano Ermon"],"pdf_url":"https://arxiv.org/pdf/2311.16424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16420v1","updated":"2023-11-28T02:00:47Z","published":"2023-11-28T02:00:47Z","title":"Model-free Test Time Adaptation for Out-Of-Distribution Detection","summary":" Out-of-distribution (OOD) detection is essential for the reliability of ML\nmodels. Most existing methods for OOD detection learn a fixed decision\ncriterion from a given in-distribution dataset and apply it universally to\ndecide if a data point is OOD. Recent work~\\cite{fang2022is} shows that given\nonly in-distribution data, it is impossible to reliably detect OOD data without\nextra assumptions. Motivated by the theoretical result and recent exploration\nof test-time adaptation methods, we propose a Non-Parametric Test Time\n\\textbf{Ada}ptation framework for \\textbf{O}ut-Of-\\textbf{D}istribution\n\\textbf{D}etection (\\abbr). Unlike conventional methods, \\abbr utilizes online\ntest samples for model adaptation during testing, enhancing adaptability to\nchanging data distributions. The framework incorporates detected OOD instances\ninto decision-making, reducing false positive rates, particularly when ID and\nOOD distributions overlap significantly. We demonstrate the effectiveness of\n\\abbr through comprehensive experiments on multiple OOD detection benchmarks,\nextensive empirical studies show that \\abbr significantly improves the\nperformance of OOD detection over state-of-the-art methods. Specifically, \\abbr\nreduces the false positive rate (FPR95) by $23.23\\%$ on the CIFAR-10 benchmarks\nand $38\\%$ on the ImageNet-1k benchmarks compared to the advanced methods.\nLastly, we theoretically verify the effectiveness of \\abbr.\n","authors":["YiFan Zhang","Xue Wang","Tian Zhou","Kun Yuan","Zhang Zhang","Liang Wang","Rong Jin","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2311.16420v1.pdf","comment":"12 pages, 10 figures"},{"id":"http://arxiv.org/abs/2309.02705v2","updated":"2023-11-28T01:56:17Z","published":"2023-09-06T04:37:20Z","title":"Certifying LLM Safety against Adversarial Prompting","summary":" Large language models (LLMs) released for public use incorporate guardrails\nto ensure their output is safe, often referred to as \"model alignment.\" An\naligned language model should decline a user's request to produce harmful\ncontent. However, such safety measures are vulnerable to adversarial attacks,\nwhich add maliciously designed token sequences to a harmful prompt to bypass\nthe model's safety guards. In this work, we introduce erase-and-check, the\nfirst framework to defend against adversarial prompts with verifiable safety\nguarantees. 
We defend against three attack modes: i) adversarial suffix, which\nappends an adversarial sequence at the end of the prompt; ii) adversarial\ninsertion, where the adversarial sequence is inserted anywhere in the middle of\nthe prompt; and iii) adversarial infusion, where adversarial tokens are\ninserted at arbitrary positions in the prompt, not necessarily as a contiguous\nblock. Our experimental results demonstrate that this procedure can obtain\nstrong certified safety guarantees on harmful prompts while maintaining good\nempirical performance on safe prompts. For example, against adversarial\nsuffixes of length 20, it certifiably detects 92% of harmful prompts and labels\n94% of safe prompts correctly using the open-source language model Llama 2 as\nthe safety filter. We further improve the filter's performance, in terms of\naccuracy and speed, by replacing Llama 2 with a DistilBERT safety classifier\nfine-tuned on safe and harmful prompts. Additionally, we propose two efficient\nempirical defenses: i) RandEC, a randomized version of erase-and-check that\nevaluates the safety filter on a small subset of the erased subsequences, and\nii) GradEC, a gradient-based version that optimizes the erased tokens to remove\nthe adversarial sequence. The code for our experiments is available at\nhttps://github.com/aounon/certified-llm-safety.\n","authors":["Aounon Kumar","Chirag Agarwal","Suraj Srinivas","Aaron Jiaxun Li","Soheil Feizi","Himabindu Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2309.02705v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16416v1","updated":"2023-11-28T01:49:51Z","published":"2023-11-28T01:49:51Z","title":"A Combinatorial Approach to Robust PCA","summary":" We study the problem of recovering Gaussian data under adversarial\ncorruptions when the noises are low-rank and the corruptions are on the\ncoordinate level. Concretely, we assume that the Gaussian noises lie in an\nunknown $k$-dimensional subspace $U \\subseteq \\mathbb{R}^d$, and $s$ randomly\nchosen coordinates of each data point fall into the control of an adversary.\nThis setting models the scenario of learning from high-dimensional yet\nstructured data that are transmitted through a highly-noisy channel, so that\nthe data points are unlikely to be entirely clean.\n Our main result is an efficient algorithm that, when $ks^2 = O(d)$, recovers\nevery single data point up to a nearly-optimal $\\ell_1$ error of $\\tilde\nO(ks/d)$ in expectation. At the core of our proof is a new analysis of the\nwell-known Basis Pursuit (BP) method for recovering a sparse signal, which is\nknown to succeed under additional assumptions (e.g., incoherence or the\nrestricted isometry property) on the underlying subspace $U$. In contrast, we\npresent a novel approach via studying a natural combinatorial problem and show\nthat, over the randomness in the support of the sparse signal, a\nhigh-probability error bound is possible even if the subspace $U$ is arbitrary.\n","authors":["Weihao Kong","Mingda Qiao","Rajat Sen"],"pdf_url":"https://arxiv.org/pdf/2311.16416v1.pdf","comment":"To appear at ITCS 2024"},{"id":"http://arxiv.org/abs/2206.01370v3","updated":"2023-11-28T01:01:54Z","published":"2022-06-03T02:41:59Z","title":"Towards Improving the Generation Quality of Autoregressive Slot VAEs","summary":" Unconditional scene inference and generation are challenging to learn jointly\nwith a single compositional model. 
Despite encouraging progress on models that\nextract object-centric representations (''slots'') from images, unconditional\ngeneration of scenes from slots has received less attention. This is primarily\nbecause learning the multi-object relations necessary to imagine coherent\nscenes is difficult. We hypothesize that most existing slot-based models have a\nlimited ability to learn object correlations. We propose two improvements that\nstrengthen object correlation learning. The first is to condition the slots on\na global, scene-level variable that captures higher-order correlations between\nslots. Second, we address the fundamental lack of a canonical order for objects\nin images by proposing to learn a consistent order to use for the\nautoregressive generation of scene objects. Specifically, we train an\nautoregressive slot prior to sequentially generate scene objects following a\nlearned order. Ordered slot inference entails first estimating a randomly\nordered set of slots using existing approaches for extracting slots from\nimages, then aligning those slots to ordered slots generated autoregressively\nwith the slot prior. Our experiments across three multi-object environments\ndemonstrate clear gains in unconditional scene generation quality. Detailed\nablation studies are also provided that validate the two proposed improvements.\n","authors":["Patrick Emami","Pan He","Sanjay Ranka","Anand Rangarajan"],"pdf_url":"https://arxiv.org/pdf/2206.01370v3.pdf","comment":"Published in Neural Computation. 38 pages, 18 figures. Code and\n videos available at https://github.com/pemami4911/segregate-relate-imagine"},{"id":"http://arxiv.org/abs/2306.13053v2","updated":"2023-11-28T00:53:55Z","published":"2023-06-22T17:20:30Z","title":"Context-lumpable stochastic bandits","summary":" We consider a contextual bandit problem with $S$ contexts and $K$ actions. In\neach round $t=1,2,\\dots$, the learner observes a random context and chooses an\naction based on its past experience. The learner then observes a random reward\nwhose mean is a function of the context and the action for the round. Under the\nassumption that the contexts can be lumped into $r\\le \\min\\{S,K\\}$ groups such\nthat the mean reward for the various actions is the same for any two contexts\nthat are in the same group, we give an algorithm that outputs an\n$\\epsilon$-optimal policy after using at most $\\widetilde O(r (S +K\n)/\\epsilon^2)$ samples with high probability and provide a matching\n$\\Omega(r(S+K)/\\epsilon^2)$ lower bound. In the regret minimization setting, we\ngive an algorithm whose cumulative regret up to time $T$ is bounded by\n$\\widetilde O(\\sqrt{r^3(S+K)T})$. To the best of our knowledge, we are the\nfirst to show the near-optimal sample complexity in the PAC setting and\n$\\widetilde O(\\sqrt{{poly}(r)(S+K)T})$ minimax regret in the online setting for\nthis problem. 
We also show our algorithms can be applied to more general\nlow-rank bandits and get improved regret bounds in some scenarios.\n","authors":["Chung-Wei Lee","Qinghua Liu","Yasin Abbasi-Yadkori","Chi Jin","Tor Lattimore","Csaba Szepesvári"],"pdf_url":"https://arxiv.org/pdf/2306.13053v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.03017v3","updated":"2023-11-28T00:31:49Z","published":"2022-07-06T23:51:44Z","title":"ACHO: Adaptive Conformal Hyperparameter Optimization","summary":" Several novel frameworks for hyperparameter search have emerged in the last\ndecade, but most rely on strict, often normal, distributional assumptions,\nlimiting search model flexibility. This paper proposes a novel optimization\nframework based on upper confidence bound sampling of conformal confidence\nintervals, whose weaker assumption of exchangeability enables greater choice of\nsearch model architectures. Several such architectures were explored and\nbenchmarked on hyperparameter search of random forests and convolutional neural\nnetworks, displaying satisfactory interval coverage and superior tuning\nperformance to random search.\n","authors":["Riccardo Doyle"],"pdf_url":"https://arxiv.org/pdf/2207.03017v3.pdf","comment":"12 pages, 4 tables, 4 figures"},{"id":"http://arxiv.org/abs/2303.09373v2","updated":"2023-11-28T00:07:12Z","published":"2023-03-16T15:01:50Z","title":"MAPSeg: Unified Unsupervised Domain Adaptation for Heterogeneous Medical\n Image Segmentation Based on 3D Masked Autoencoding and Pseudo-Labeling","summary":" Robust segmentation is critical for deriving quantitative measures from\nlarge-scale, multi-center, and longitudinal medical scans. Manually annotating\nmedical scans, however, is expensive and labor-intensive and may not always be\navailable in every domain. Unsupervised domain adaptation (UDA) is a\nwell-studied technique that alleviates this label-scarcity problem by\nleveraging available labels from another domain. In this study, we introduce\nMasked Autoencoding and Pseudo-Labeling Segmentation (MAPSeg), a\n$\\textbf{unified}$ UDA framework with great versatility and superior\nperformance for heterogeneous and volumetric medical image segmentation. To the\nbest of our knowledge, this is the first study that systematically reviews and\ndevelops a framework to tackle four different domain shifts in medical image\nsegmentation. More importantly, MAPSeg is the first framework that can be\napplied to $\\textbf{centralized}$, $\\textbf{federated}$, and\n$\\textbf{test-time}$ UDA while maintaining comparable performance. We compare\nMAPSeg with previous state-of-the-art methods on a private infant brain MRI\ndataset and a public cardiac CT-MRI dataset, and MAPSeg outperforms others by a\nlarge margin (10.5 Dice improvement on the private MRI dataset and 5.7 on the\npublic CT-MRI dataset). MAPSeg poses great practical value and can be applied\nto real-world problems. Our code and pretrained model will be available later.\n","authors":["Xuzhe Zhang","Yuhao Wu","Elsa Angelini","Ang Li","Jia Guo","Jerod M. Rasmussen","Thomas G. O'Connor","Pathik D. Wadhwa","Andrea Parolin Jackowski","Hai Li","Jonathan Posner","Andrew F. Laine","Yun Wang"],"pdf_url":"https://arxiv.org/pdf/2303.09373v2.pdf","comment":"16 pages and 7 figures. Revised and extended to test-time and\n federated domain adaptation. Xuzhe Zhang and Yuhao Wu are co-first authors.\n Andrew F. 
Laine and Yun Wang are co-senior supervising authors"},{"id":"http://arxiv.org/abs/2311.16381v1","updated":"2023-11-28T00:03:18Z","published":"2023-11-28T00:03:18Z","title":"Deep Learning for Time Series Classification of Parkinson's Disease Eye\n Tracking Data","summary":" Eye-tracking is an accessible and non-invasive technology that provides\ninformation about a subject's motor and cognitive abilities. As such, it has\nproven to be a valuable resource in the study of neurodegenerative diseases\nsuch as Parkinson's disease. Saccade experiments, in particular, have proven\nuseful in the diagnosis and staging of Parkinson's disease. However, to date,\nno single eye-movement biomarker has been found to conclusively differentiate\npatients from healthy controls. In the present work, we investigate the use of\nstate-of-the-art deep learning algorithms to perform Parkinson's disease\nclassification using eye-tracking data from saccade experiments. In contrast to\nprevious work, instead of using hand-crafted features from the saccades, we use\nraw $\\sim1.5\\,s$ long fixation intervals recorded during the preparatory phase\nbefore each trial. Using these short time series as input we implement two\ndifferent classification models, InceptionTime and ROCKET. We find that the\nmodels are able to learn the classification task and generalize to unseen\nsubjects. InceptionTime achieves $78\\%$ accuracy, while ROCKET achieves $88\\%$\naccuracy. We also employ a novel method for pruning the ROCKET model to improve\ninterpretability and generalizability, achieving an accuracy of $96\\%$. Our\nresults suggest that fixation data has low inter-subject variability and\npotentially carries useful information about brain cognitive and motor\nconditions, making it suitable for use with machine learning in the discovery\nof disease-relevant biomarkers.\n","authors":["Gonzalo Uribarri","Simon Ekman von Huth","Josefine Waldthaler","Per Svenningsson","Erik Fransén"],"pdf_url":"https://arxiv.org/pdf/2311.16381v1.pdf","comment":"Extended Abstract presented at Machine Learning for Health (ML4H)\n symposium 2023, December 10th, 2023, New Orleans, United States, 12 pages"},{"id":"http://arxiv.org/abs/2311.17279v1","updated":"2023-11-28T23:38:42Z","published":"2023-11-28T23:38:42Z","title":"LiveTune: Dynamic Parameter Tuning for Training Deep Neural Networks","summary":" Traditional machine learning training is a static process that lacks\nreal-time adaptability of hyperparameters. Popular tuning solutions during\nruntime involve checkpoints and schedulers. Adjusting hyper-parameters usually\nrequire the program to be restarted, wasting utilization and time, while\nplacing unnecessary strain on memory and processors. We present LiveTune, a new\nframework allowing real-time parameter tuning during training through\nLiveVariables. Live Variables allow for a continuous training session by\nstoring parameters on designated ports on the system, allowing them to be\ndynamically adjusted. 
Extensive evaluations of our framework show saving up to\n60 seconds and 5.4 Kilojoules of energy per hyperparameter change.\n","authors":["Soheil Zibakhsh Shabgahi","Nojan Sheybani","Aiden Tabrizi","Farinaz Koushanfar"],"pdf_url":"https://arxiv.org/pdf/2311.17279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17277v1","updated":"2023-11-28T23:33:16Z","published":"2023-11-28T23:33:16Z","title":"An Online Optimization-Based Decision Support Tool for Small Farmers in\n India: Learning in Non-stationary Environments","summary":" Crop management decision support systems are specialized tools for farmers\nthat reduce the riskiness of revenue streams, especially valuable for use under\nthe current climate changes that impact agricultural productivity.\nUnfortunately, small farmers in India, who could greatly benefit from these\ntools, do not have access to them. In this paper, we model an individual\ngreenhouse as a Markov Decision Process (MDP) and adapt Li and Li (2019)'s\nFollow the Weighted Leader (FWL) online learning algorithm to offer crop\nplanning advice. We successfully produce utility-preserving cropping pattern\nsuggestions in simulations. When we compare against an offline planning\nalgorithm, we achieve the same cumulative revenue with greatly reduced runtime.\n","authors":["Tuxun Lu","Aviva Prins"],"pdf_url":"https://arxiv.org/pdf/2311.17277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09511v2","updated":"2023-11-28T22:59:41Z","published":"2023-11-16T02:32:26Z","title":"Identifying Systems with Symmetries using Equivariant Autoregressive\n Reservoir Computers","summary":" The investigation reported in this document focuses on identifying systems\nwith symmetries using equivariant autoregressive reservoir computers. General\nresults in structured matrix approximation theory are presented, exploring a\ntwo-fold approach. Firstly, a comprehensive examination of generic\nsymmetry-preserving nonlinear time delay embedding is conducted. This involves\nanalyzing time series data sampled from an equivariant system under study.\nSecondly, sparse least-squares methods are applied to discern approximate\nrepresentations of the output coupling matrices. These matrices play a pivotal\nrole in determining the nonlinear autoregressive representation of an\nequivariant system. The structural characteristics of these matrices are\ndictated by the set of symmetries inherent in the system. The document outlines\nprototypical algorithms derived from the described techniques, offering insight\ninto their practical applications. Emphasis is placed on their effectiveness in\nthe identification and predictive simulation of equivariant nonlinear systems,\nregardless of whether such systems exhibit chaotic behavior.\n","authors":["Fredy Vides","Idelfonso B. R. Nogueira","Lendy Banegas","Evelyn Flores"],"pdf_url":"https://arxiv.org/pdf/2311.09511v2.pdf","comment":"The views expressed in the article do not necessarily represent the\n views of the National Commission of Banks and Insurance Companies of Honduras"},{"id":"http://arxiv.org/abs/2311.17259v1","updated":"2023-11-28T22:48:00Z","published":"2023-11-28T22:48:00Z","title":"SoUnD Framework: Analyzing (So)cial Representation in (Un)structured\n (D)ata","summary":" The unstructured nature of data used in foundation model development is a\nchallenge to systematic analyses for making data use and documentation\ndecisions. From a Responsible AI perspective, these decisions often rely upon\nunderstanding how people are represented in data. 
We propose a framework\ndesigned to guide analysis of human representation in unstructured data and\nidentify downstream risks. We apply the framework in two toy examples using the\nCommon Crawl web text corpus (C4) and LAION-400M. We also propose a set of\nhypothetical action steps in service of dataset use, development, and\ndocumentation.\n","authors":["Mark Díaz","Sunipa Dev","Emily Reif","Remi Denton","Vinodkumar Prabhakaran"],"pdf_url":"https://arxiv.org/pdf/2311.17259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06703v2","updated":"2023-11-28T22:37:39Z","published":"2023-08-13T07:03:22Z","title":"Understanding the robustness difference between stochastic gradient\n descent and adaptive gradient methods","summary":" Stochastic gradient descent (SGD) and adaptive gradient methods, such as Adam\nand RMSProp, have been widely used in training deep neural networks. We\nempirically show that while the difference between the standard generalization\nperformance of models trained using these methods is small, those trained using\nSGD exhibit far greater robustness under input perturbations. Notably, our\ninvestigation demonstrates the presence of irrelevant frequencies in natural\ndatasets, where alterations do not affect models' generalization performance.\nHowever, models trained with adaptive methods show sensitivity to these\nchanges, suggesting that their use of irrelevant frequencies can lead to\nsolutions sensitive to perturbations. To better understand this difference, we\nstudy the learning dynamics of gradient descent (GD) and sign gradient descent\n(signGD) on a synthetic dataset that mirrors natural signals. With a\nthree-dimensional input space, the models optimized with GD and signGD have\nstandard risks close to zero but vary in their adversarial risks. Our result\nshows that linear models' robustness to $\\ell_2$-norm bounded changes is\ninversely proportional to the model parameters' weight norm: a smaller weight\nnorm implies better robustness. In the context of deep learning, our\nexperiments show that SGD-trained neural networks have smaller Lipschitz\nconstants, explaining the better robustness to input perturbations than those\ntrained with adaptive gradient methods.\n","authors":["Avery Ma","Yangchen Pan","Amir-massoud Farahmand"],"pdf_url":"https://arxiv.org/pdf/2308.06703v2.pdf","comment":"Accepted at TMLR (Featured Certification). Code: see\n https://github.com/averyma/opt-robust"},{"id":"http://arxiv.org/abs/2311.17250v1","updated":"2023-11-28T22:11:15Z","published":"2023-11-28T22:11:15Z","title":"Fourier Neural Differential Equations for learning Quantum Field\n Theories","summary":" A Quantum Field Theory is defined by its interaction Hamiltonian, and linked\nto experimental data by the scattering matrix. The scattering matrix is\ncalculated as a perturbative series, and represented succinctly as a first\norder differential equation in time. Neural Differential Equations (NDEs) learn\nthe time derivative of a residual network's hidden state, and have proven\nefficacy in learning differential equations with physical constraints. Hence\nusing an NDE to learn particle scattering matrices presents a possible\nexperiment-theory phenomenological connection. In this paper, NDE models are\nused to learn $\\phi^4$ theory, Scalar-Yukawa theory and Scalar Quantum\nElectrodynamics. A new NDE architecture is also introduced, the Fourier Neural\nDifferential Equation (FNDE), which combines NDE integration and Fourier\nnetwork convolution. 
The FNDE model demonstrates better generalisability than\nthe non-integrated equivalent FNO model. It is also shown that by training on\nscattering data, the interaction Hamiltonian of a theory can be extracted from\nnetwork parameters.\n","authors":["Isaac Brant","Alexander Norcliffe","Pietro Liò"],"pdf_url":"https://arxiv.org/pdf/2311.17250v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.13868v2","updated":"2023-11-28T21:59:49Z","published":"2023-07-26T00:01:16Z","title":"Learning sources of variability from high-dimensional observational\n studies","summary":" Causal inference studies whether the presence of a variable influences an\nobserved outcome. As measured by quantities such as the \"average treatment\neffect,\" this paradigm is employed across numerous biological fields, from\nvaccine and drug development to policy interventions. Unfortunately, the\nmajority of these methods are often limited to univariate outcomes. Our work\ngeneralizes causal estimands to outcomes with any number of dimensions or any\nmeasurable space, and formulates traditional causal estimands for nominal\nvariables as causal discrepancy tests. We propose a simple technique for\nadjusting universally consistent conditional independence tests and prove that\nthese tests are universally consistent causal discrepancy tests. Numerical\nexperiments illustrate that our method, Causal CDcorr, leads to improvements in\nboth finite sample validity and power when compared to existing strategies. Our\nmethods are all open source and available at github.com/ebridge2/cdcorr.\n","authors":["Eric W. Bridgeford","Jaewon Chung","Brian Gilbert","Sambit Panda","Adam Li","Cencheng Shen","Alexandra Badea","Brian Caffo","Joshua T. Vogelstein"],"pdf_url":"https://arxiv.org/pdf/2307.13868v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11695v3","updated":"2023-11-28T21:36:53Z","published":"2023-01-27T13:10:45Z","title":"LegendreTron: Uprising Proper Multiclass Loss Learning","summary":" Loss functions serve as the foundation of supervised learning and are often\nchosen prior to model development. To avoid potentially ad hoc choices of\nlosses, statistical decision theory describes a desirable property for losses\nknown as \\emph{properness}, which asserts that Bayes' rule is optimal. Recent\nworks have sought to \\emph{learn losses} and models jointly. Existing methods\ndo this by fitting an inverse canonical link function which monotonically maps\n$\\mathbb{R}$ to $[0,1]$ to estimate probabilities for binary problems. In this\npaper, we extend monotonicity to maps between $\\mathbb{R}^{C-1}$ and the\nprojected probability simplex $\\tilde{\\Delta}^{C-1}$ by using monotonicity of\ngradients of convex functions. We present {\\sc LegendreTron} as a novel and\npractical method that jointly learns \\emph{proper canonical losses} and\nprobabilities for multiclass problems. 
Tested on a benchmark of domains with up\nto 1,000 classes, our experimental results show that our method consistently\noutperforms the natural multiclass baseline under a $t$-test at 99%\nsignificance on all datasets with greater than 10 classes.\n","authors":["Kevin Lam","Christian Walder","Spiridon Penev","Richard Nock"],"pdf_url":"https://arxiv.org/pdf/2301.11695v3.pdf","comment":"Accepted at the 40th International Conference on Machine Learning\n (ICML 2023)"},{"id":"http://arxiv.org/abs/2308.01050v4","updated":"2023-11-28T21:23:04Z","published":"2023-08-02T09:48:08Z","title":"A Counterfactual Safety Margin Perspective on the Scoring of Autonomous\n Vehicles' Riskiness","summary":" Autonomous Vehicles (AVs) promise a range of societal advantages, including\nbroader access to mobility, reduced road accidents, and enhanced transportation\nefficiency. However, evaluating the risks linked to AVs is complex due to\nlimited historical data and the swift progression of technology. This paper\npresents a data-driven framework for assessing the risk of different AVs'\nbehaviors in various operational design domains (ODDs), based on counterfactual\nsimulations of \"misbehaving\" road users. We propose the notion of\ncounterfactual safety margin, which represents the minimum deviation from\nnominal behavior that could cause a collision. This methodology not only\npinpoints the most critical scenarios but also quantifies the (relative) risk's\nfrequency and severity concerning AVs. Importantly, we show that our approach\nis applicable even when the AV's behavioral policy remains undisclosed, through\nworst- and best-case analyses, benefiting external entities like regulators and\nrisk evaluators. Our experimental outcomes demonstrate the correlation between\nthe safety margin, the quality of the driving policy, and the ODD, shedding\nlight on the relative risks of different AV providers. Overall, this work\ncontributes to the safety assessment of AVs and addresses legislative and\ninsurance concerns surrounding this burgeoning technology.\n","authors":["Alessandro Zanardi","Andrea Censi","Margherita Atzei","Luigi Di Lillo","Emilio Frazzoli"],"pdf_url":"https://arxiv.org/pdf/2308.01050v4.pdf","comment":"updated experiments"},{"id":"http://arxiv.org/abs/2306.15832v2","updated":"2023-11-28T21:18:38Z","published":"2023-06-27T23:33:30Z","title":"Easing Color Shifts in Score-Based Diffusion Models","summary":" Generated images of score-based models can suffer from errors in their\nspatial means, an effect, referred to as a color shift, which grows for larger\nimages. This paper investigates a previously-introduced approach to mitigate\ncolor shifts in score-based diffusion models. We quantify the performance of a\nnonlinear bypass connection in the score network, designed to process the\nspatial mean of the input and to predict the mean of the score function. We\nshow that this network architecture substantially improves the resulting\nquality of the generated images, and that this improvement is approximately\nindependent of the size of the generated images. As a result, this modified\narchitecture offers a simple solution for the color shift problem across image\nsizes. 
We additionally discuss the origin of color shifts in an idealized\nsetting in order to motivate the approach.\n","authors":["Katherine Deck","Tobias Bischoff"],"pdf_url":"https://arxiv.org/pdf/2306.15832v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17233v1","updated":"2023-11-28T21:15:24Z","published":"2023-11-28T21:15:24Z","title":"Quantifying the redundancy between prosody and text","summary":" Prosody -- the suprasegmental component of speech, including pitch, loudness,\nand tempo -- carries critical aspects of meaning. However, the relationship\nbetween the information conveyed by prosody vs. by the words themselves remains\npoorly understood. We use large language models (LLMs) to estimate how much\ninformation is redundant between prosody and the words themselves. Using a\nlarge spoken corpus of English audiobooks, we extract prosodic features aligned\nto individual words and test how well they can be predicted from LLM\nembeddings, compared to non-contextual word embeddings. We find a high degree\nof redundancy between the information carried by the words and prosodic\ninformation across several prosodic features, including intensity, duration,\npauses, and pitch contours. Furthermore, a word's prosodic information is\nredundant with both the word itself and the context preceding as well as\nfollowing it. Still, we observe that prosodic features can not be fully\npredicted from text, suggesting that prosody carries information above and\nbeyond the words. Along with this paper, we release a general-purpose data\nprocessing pipeline for quantifying the relationship between linguistic\ninformation and extra-linguistic features.\n","authors":["Lukas Wolf","Tiago Pimentel","Evelina Fedorenko","Ryan Cotterell","Alex Warstadt","Ethan Wilcox","Tamar Regev"],"pdf_url":"https://arxiv.org/pdf/2311.17233v1.pdf","comment":"Published at The 2023 Conference on Empirical Methods in Natural\n Language Processing (EMNLP)"},{"id":"http://arxiv.org/abs/2110.04442v2","updated":"2023-11-28T21:08:05Z","published":"2021-10-09T03:25:01Z","title":"A Primer on Deep Learning for Causal Inference","summary":" This review systematizes the emerging literature for causal inference using\ndeep neural networks under the potential outcomes framework. It provides an\nintuitive introduction on how deep learning can be used to estimate/predict\nheterogeneous treatment effects and extend causal inference to settings where\nconfounding is non-linear, time varying, or encoded in text, networks, and\nimages. To maximize accessibility, we also introduce prerequisite concepts from\ncausal inference and deep learning. 
The survey differs from other treatments of\ndeep learning and causal inference in its sharp focus on observational causal\nestimation, its extended exposition of key algorithms, and its detailed\ntutorials for implementing, training, and selecting among deep estimators in\nTensorflow 2 available at github.com/kochbj/Deep-Learning-for-Causal-Inference.\n","authors":["Bernard Koch","Tim Sainburg","Pablo Geraldo","Song Jiang","Yizhou Sun","Jacob Gates Foster"],"pdf_url":"https://arxiv.org/pdf/2110.04442v2.pdf","comment":"Forthcoming in Sociological Methods and Research"},{"id":"http://arxiv.org/abs/2110.14053v6","updated":"2023-11-28T21:05:18Z","published":"2021-10-26T22:08:22Z","title":"NeuroBack: Improving CDCL SAT Solving using Graph Neural Networks","summary":" Propositional satisfiability (SAT) is an NP-complete problem that impacts\nmany research fields, such as planning, verification, and security. Mainstream\nmodern SAT solvers are based on the Conflict-Driven Clause Learning (CDCL)\nalgorithm. Recent work aimed to enhance CDCL SAT solvers using Graph Neural\nNetworks (GNNs). However, so far this approach either has not made solving more\neffective, or required substantial GPU resources for frequent online model\ninferences. Aiming to make GNN improvements practical, this paper proposes an\napproach called NeuroBack, which builds on two insights: (1) predicting phases\n(i.e., values) of variables appearing in the majority (or even all) of the\nsatisfying assignments are essential for CDCL SAT solving, and (2) it is\nsufficient to query the neural model only once for the predictions before the\nSAT solving starts. Once trained, the offline model inference allows NeuroBack\nto execute exclusively on the CPU, removing its reliance on GPU resources. To\ntrain NeuroBack, a new dataset called DataBack containing 120,286 data samples\nis created. Finally, NeuroBack is implemented as an enhancement to a\nstate-of-the-art SAT solver called Kissat. As a result, it allowed Kissat to\nsolve 5.2% more problems on the recent SAT competition problem set,\nSATCOMP-2022. NeuroBack therefore shows how machine learning can be harnessed\nto improve SAT solving in an effective and practical manner.\n","authors":["Wenxi Wang","Yang Hu","Mohit Tiwari","Sarfraz Khurshid","Kenneth McMillan","Risto Miikkulainen"],"pdf_url":"https://arxiv.org/pdf/2110.14053v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14743v2","updated":"2023-11-28T21:04:36Z","published":"2023-11-21T18:41:26Z","title":"A Baseline Analysis of Reward Models' Ability To Accurately Analyze\n Foundation Models Under Distribution Shift","summary":" Foundation models, specifically Large Language Models (LLM's), have lately\ngained wide-spread attention and adoption. Reinforcement Learning with Human\nFeedback (RLHF) involves training a reward model to capture desired behaviors,\nwhich is then used to align an LLM. These reward models are additionally used\nat inference-time to estimate how well LLM responses adhere to those desired\nbehaviors. However, there is little work measuring how robust these reward\nmodels are to distribution shifts. In this work, we evaluate how reward model\nperformance - measured via accuracy and calibration (i.e. alignment between\naccuracy and confidence) - is affected by distribution shift. 
We show novel\ncalibration patterns and accuracy drops due to OOD prompts and responses, and\nthat the reward model is more sensitive to shifts in responses than prompts.\nAdditionally, we adapt an OOD detection technique commonly used in\nclassification to the reward model setting in order to detect these\ndistribution shifts in prompts and responses.\n","authors":["Ben Pikus","Will LeVine","Tony Chen","Sean Hendryx"],"pdf_url":"https://arxiv.org/pdf/2311.14743v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17225v1","updated":"2023-11-28T20:57:10Z","published":"2023-11-28T20:57:10Z","title":"Invariance assumptions for class distribution estimation","summary":" We study the problem of class distribution estimation under dataset shift. On\nthe training dataset, both features and class labels are observed while on the\ntest dataset only the features can be observed. The task then is the estimation\nof the distribution of the class labels, i.e. the estimation of the class prior\nprobabilities, in the test dataset. Assumptions of invariance between the\ntraining joint distribution of features and labels and the test distribution\ncan considerably facilitate this task. We discuss the assumptions of covariate\nshift, factorizable joint shift, and sparse joint shift and their implications\nfor class distribution estimation.\n","authors":["Dirk Tasche"],"pdf_url":"https://arxiv.org/pdf/2311.17225v1.pdf","comment":"16 pages, presented at workshop Learning to Quantify: Methods and\n Applications (LQ 2023), Torino, September 18, 2023"},{"id":"http://arxiv.org/abs/2311.17218v1","updated":"2023-11-28T20:42:30Z","published":"2023-11-28T20:42:30Z","title":"BIM: Block-Wise Self-Supervised Learning with Masked Image Modeling","summary":" Like masked language modeling (MLM) in natural language processing, masked\nimage modeling (MIM) aims to extract valuable insights from image patches to\nenhance the feature extraction capabilities of the underlying deep neural\nnetwork (DNN). Contrasted with other training paradigms like supervised\nlearning and unsupervised contrastive learning, masked image modeling (MIM)\npretraining typically demands significant computational resources in order to\nmanage large training data batches (e.g., 4096). The significant memory and\ncomputation requirements pose a considerable challenge to its broad adoption.\nTo mitigate this, we introduce a novel learning framework,\ntermed~\\textit{Block-Wise Masked Image Modeling} (BIM). This framework involves\ndecomposing the MIM tasks into several sub-tasks with independent computation\npatterns, resulting in block-wise back-propagation operations instead of the\ntraditional end-to-end approach. Our proposed BIM maintains superior\nperformance compared to conventional MIM while greatly reducing peak memory\nconsumption. Moreover, BIM naturally enables the concurrent training of\nnumerous DNN backbones of varying depths. This leads to the creation of\nmultiple trained DNN backbones, each tailored to different hardware platforms\nwith distinct computing capabilities. 
This approach significantly reduces\ncomputational costs in comparison with training each DNN backbone individually.\nOur framework offers a promising solution for resource constrained training of\nMIM.\n","authors":["Yixuan Luo","Mengye Ren","Sai Qian Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.17218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17204v1","updated":"2023-11-28T20:18:42Z","published":"2023-11-28T20:18:42Z","title":"Optimal EEG Electrode Set for Emotion Recognition From Brain Signals: An\n Empirical Quest","summary":" The human brain is a complex organ, still completely undiscovered, that\ncontrols almost all the parts of the body. Apart from survival, the human brain\nstimulates emotions. Recent research indicates that brain signals can be very\neffective for emotion recognition. However, which parts of the brain exhibit\nmost of the emotions is still under-explored. In this study, we empirically\nanalyze the contribution of each part of the brain in exhibiting emotions. We\nuse the DEAP dataset to find the most optimal electrode set which eventually\nleads to the effective brain part associated with emotions. We use Fast Fourier\nTransformation for effective feature extraction and a 1D-CNN with residual\nconnection for classification. Though 32 electrodes from the DEAP dataset got\nan accuracy of 97.34%, only 12 electrodes (F7, P8, O1, F8, C4, T7, PO3, Fp1,\nFp2, O2, P3, and Fz) achieve 95.81% accuracy. This study also shows that adding\nmore than 10 electrodes does not improve performance significantly. Moreover,\nthe frontal lobe is the most important for recognizing emotion.\n","authors":["Rumman Ahmed Prodhan","Sumya Akter","Tanmoy Sarkar Pias","Md. Akhtaruzzaman Adnan"],"pdf_url":"https://arxiv.org/pdf/2311.17204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.11952v3","updated":"2023-11-28T20:13:20Z","published":"2022-05-24T10:32:32Z","title":"3D helical CT Reconstruction with a Memory Efficient Learned Primal-Dual\n Architecture","summary":" Deep learning based computed tomography (CT) reconstruction has demonstrated\noutstanding performance on simulated 2D low-dose CT data. This applies in\nparticular to domain adapted neural networks, which incorporate a handcrafted\nphysics model for CT imaging. Empirical evidence shows that employing such\narchitectures reduces the demand for training data and improves upon\ngeneralisation. However, their training requires large computational resources\nthat quickly become prohibitive in 3D helical CT, which is the most common\nacquisition geometry used for medical imaging. Furthermore, clinical data also\ncomes with other challenges not accounted for in simulations, like errors in\nflux measurement, resolution mismatch and, most importantly, the absence of the\nreal ground truth. The necessity to have a computationally feasible training\ncombined with the need to address these issues has made it difficult to\nevaluate deep learning based reconstruction on clinical 3D helical CT. This\npaper modifies a domain adapted neural network architecture, the Learned\nPrimal-Dual (LPD), so that it can be trained and applied to reconstruction in\nthis setting. We achieve this by splitting the helical trajectory into sections\nand applying the unrolled LPD iterations to those sections sequentially. To the\nbest of our knowledge, this work is the first to apply an unrolled deep\nlearning architecture for reconstruction on full-sized clinical data, like\nthose in the Low dose CT image and projection data set (LDCT). 
Moreover,\ntraining and testing is done on a single GPU card with 24GB of memory.\n","authors":["Jevgenija Rudzusika","Buda Bajić","Thomas Koehler","Ozan Öktem"],"pdf_url":"https://arxiv.org/pdf/2205.11952v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17190v1","updated":"2023-11-28T19:34:40Z","published":"2023-11-28T19:34:40Z","title":"Minimax Exploiter: A Data Efficient Approach for Competitive Self-Play","summary":" Recent advances in Competitive Self-Play (CSP) have achieved, or even\nsurpassed, human level performance in complex game environments such as Dota 2\nand StarCraft II using Distributed Multi-Agent Reinforcement Learning (MARL).\nOne core component of these methods relies on creating a pool of learning\nagents -- consisting of the Main Agent, past versions of this agent, and\nExploiter Agents -- where Exploiter Agents learn counter-strategies to the Main\nAgents. A key drawback of these approaches is the large computational cost and\nphysical time that is required to train the system, making them impractical to\ndeploy in highly iterative real-life settings such as video game productions.\nIn this paper, we propose the Minimax Exploiter, a game theoretic approach to\nexploiting Main Agents that leverages knowledge of its opponents, leading to\nsignificant increases in data efficiency. We validate our approach in a\ndiversity of settings, including simple turn based games, the arcade learning\nenvironment, and For Honor, a modern video game. The Minimax Exploiter\nconsistently outperforms strong baselines, demonstrating improved stability and\ndata efficiency, leading to a robust CSP-MARL method that is both flexible and\neasy to deploy.\n","authors":["Daniel Bairamian","Philippe Marcotte","Joshua Romoff","Gabriel Robert","Derek Nowrouzezahrai"],"pdf_url":"https://arxiv.org/pdf/2311.17190v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01870v2","updated":"2023-11-28T19:26:33Z","published":"2023-10-03T08:15:20Z","title":"DeepDecipher: Accessing and Investigating Neuron Activation in Large\n Language Models","summary":" As large language models (LLMs) become more capable, there is an urgent need\nfor interpretable and transparent tools. Current methods are difficult to\nimplement, and accessible tools to analyze model internals are lacking. To\nbridge this gap, we present DeepDecipher - an API and interface for probing\nneurons in transformer models' MLP layers. DeepDecipher makes the outputs of\nadvanced interpretability techniques for LLMs readily available. The\neasy-to-use interface also makes inspecting these complex models more\nintuitive. This paper outlines DeepDecipher's design and capabilities. We\ndemonstrate how to analyze neurons, compare models, and gain insights into\nmodel behavior. For example, we contrast DeepDecipher's functionality with\nsimilar tools like Neuroscope and OpenAI's Neuron Explainer. DeepDecipher\nenables efficient, scalable analysis of LLMs. By granting access to\nstate-of-the-art interpretability methods, DeepDecipher makes LLMs more\ntransparent, trustworthy, and safe. 
Researchers, engineers, and developers can\nquickly diagnose issues, audit systems, and advance the field.\n","authors":["Albert Garde","Esben Kran","Fazl Barez"],"pdf_url":"https://arxiv.org/pdf/2310.01870v2.pdf","comment":"5 pages (9 total), 1 figure, submitted to NeurIPS 2023 Workshop XAIA"},{"id":"http://arxiv.org/abs/2310.12819v2","updated":"2023-11-28T19:23:22Z","published":"2023-10-19T15:16:43Z","title":"Hybrid Search for Efficient Planning with Completeness Guarantees","summary":" Solving complex planning problems has been a long-standing challenge in\ncomputer science. Learning-based subgoal search methods have shown promise in\ntackling these problems, but they often suffer from a lack of completeness\nguarantees, meaning that they may fail to find a solution even if one exists.\nIn this paper, we propose an efficient approach to augment a subgoal search\nmethod to achieve completeness in discrete action spaces. Specifically, we\naugment the high-level search with low-level actions to execute a multi-level\n(hybrid) search, which we call complete subgoal search. This solution achieves\nthe best of both worlds: the practical efficiency of high-level search and the\ncompleteness of low-level search. We apply the proposed search method to a\nrecently proposed subgoal search algorithm and evaluate the algorithm trained\non offline data on complex planning problems. We demonstrate that our complete\nsubgoal search not only guarantees completeness but can even improve\nperformance in terms of search expansions for instances that the high-level\ncould solve without low-level augmentations. Our approach makes it possible to\napply subgoal-level planning for systems where completeness is a critical\nrequirement.\n","authors":["Kalle Kujanpää","Joni Pajarinen","Alexander Ilin"],"pdf_url":"https://arxiv.org/pdf/2310.12819v2.pdf","comment":"NeurIPS 2023 Poster"},{"id":"http://arxiv.org/abs/2211.03803v3","updated":"2023-11-28T19:21:04Z","published":"2022-11-07T19:00:02Z","title":"Quantum-probabilistic Hamiltonian learning for generative modelling &\n anomaly detection","summary":" The Hamiltonian of an isolated quantum mechanical system determines its\ndynamics and physical behaviour. This study investigates the possibility of\nlearning and utilising a system's Hamiltonian and its variational thermal state\nestimation for data analysis techniques. For this purpose, we employ the method\nof Quantum Hamiltonian-based models for the generative modelling of simulated\nLarge Hadron Collider data and demonstrate the representability of such data as\na mixed state. In a further step, we use the learned Hamiltonian for anomaly\ndetection, showing that different sample types can form distinct dynamical\nbehaviours once treated as a quantum many-body system. We exploit these\ncharacteristics to quantify the difference between sample types. Our findings\nshow that the methodologies designed for field theory computations can be\nutilised in machine learning applications to employ theoretical approaches in\ndata analysis techniques.\n","authors":["Jack Y. Araz","Michael Spannowsky"],"pdf_url":"https://arxiv.org/pdf/2211.03803v3.pdf","comment":"14 pages, 7 figures. 
Accepted version for publication"},{"id":"http://arxiv.org/abs/2311.17179v1","updated":"2023-11-28T19:14:40Z","published":"2023-11-28T19:14:40Z","title":"SatCLIP: Global, General-Purpose Location Embeddings with Satellite\n Imagery","summary":" Geographic location is essential for modeling tasks in fields ranging from\necology to epidemiology to the Earth system sciences. However, extracting\nrelevant and meaningful characteristics of a location can be challenging, often\nentailing expensive data fusion or data distillation from global imagery\ndatasets. To address this challenge, we introduce Satellite Contrastive\nLocation-Image Pretraining (SatCLIP), a global, general-purpose geographic\nlocation encoder that learns an implicit representation of locations from\nopenly available satellite imagery. Trained location encoders provide vector\nembeddings summarizing the characteristics of any given location for convenient\nusage in diverse downstream tasks. We show that SatCLIP embeddings, pretrained\non globally sampled multi-spectral Sentinel-2 satellite data, can be used in\nvarious predictive tasks that depend on location information but not\nnecessarily satellite imagery, including temperature prediction, animal\nrecognition in imagery, and population density estimation. Across tasks,\nSatCLIP embeddings consistently outperform embeddings from existing pretrained\nlocation encoders, ranging from models trained on natural images to models\ntrained on semantic context. SatCLIP embeddings also help to improve geographic\ngeneralization. This demonstrates the potential of general-purpose location\nencoders and opens the door to learning meaningful representations of our\nplanet from the vast, varied, and largely untapped modalities of geospatial\ndata.\n","authors":["Konstantin Klemmer","Esther Rolf","Caleb Robinson","Lester Mackey","Marc Rußwurm"],"pdf_url":"https://arxiv.org/pdf/2311.17179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17173v1","updated":"2023-11-28T19:07:30Z","published":"2023-11-28T19:07:30Z","title":"A personalized Uncertainty Quantification framework for patient survival\n models: estimating individual uncertainty of patients with metastatic brain\n tumors in the absence of ground truth","summary":" To develop a novel Uncertainty Quantification (UQ) framework to estimate the\nuncertainty of patient survival models in the absence of ground truth, we\ndeveloped and evaluated our approach based on a dataset of 1383 patients\ntreated with stereotactic radiosurgery (SRS) for brain metastases between\nJanuary 2015 and December 2020. Our motivating hypothesis is that a\ntime-to-event prediction of a test patient on inference is more certain given a\nhigher feature-space-similarity to patients in the training set. Therefore, the\nuncertainty for a particular patient-of-interest is represented by the\nconcordance index between a patient similarity rank and a prediction similarity\nrank. Model uncertainty was defined as the increased percentage of the max\nuncertainty-constrained-AUC compared to the model AUC. We evaluated our method\non multiple clinically-relevant endpoints, including time to intracranial\nprogression (ICP), progression-free survival (PFS) after SRS, overall survival\n(OS), and time to ICP and/or death (ICPD), on a variety of both statistical and\nnon-statistical models, including CoxPH, conditional survival forest (CSF), and\nneural multi-task linear regression (NMTLR). 
Our results show that all models\nhad the lowest uncertainty on ICP (2.21%) and the highest uncertainty (17.28%)\non ICPD. OS models demonstrated high variation in uncertainty performance,\nwhere NMTLR had the lowest uncertainty (1.96%) and CSF had the highest\nuncertainty (14.29%). In conclusion, our method can estimate the uncertainty of\nindividual patient survival modeling results. As expected, our data empirically\ndemonstrate that as model uncertainty measured via our technique increases, the\nsimilarity between a feature-space and its predicted outcome decreases.\n","authors":["Yuqi Wang","Aarzu Gupta","David Carpenter","Trey Mullikin","Zachary J. Reitman","Scott Floyd","John Kirkpatrick","Joseph K. Salama","Paul W. Sperduto","Jian-Guo Liu","Mustafa R. Bashir","Kyle J. Lafata"],"pdf_url":"https://arxiv.org/pdf/2311.17173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12931v2","updated":"2023-11-28T19:06:49Z","published":"2023-09-22T15:30:53Z","title":"On Separate Normalization in Self-supervised Transformers","summary":" Self-supervised training methods for transformers have demonstrated\nremarkable performance across various domains. Previous transformer-based\nmodels, such as masked autoencoders (MAE), typically utilize a single\nnormalization layer for both the [CLS] symbol and the tokens. We propose in\nthis paper a simple modification that employs separate normalization layers for\nthe tokens and the [CLS] symbol to better capture their distinct\ncharacteristics and enhance downstream task performance. Our method aims to\nalleviate the potential negative effects of using the same normalization\nstatistics for both token types, which may not be optimally aligned with their\nindividual roles. We empirically show that by utilizing a separate\nnormalization layer, the [CLS] embeddings can better encode the global\ncontextual information and are distributed more uniformly in its anisotropic\nspace. When replacing the conventional normalization layer with the two\nseparate layers, we observe an average 2.7% performance improvement over the\nimage, natural language, and graph domains.\n","authors":["Xiaohui Chen","Yinkai Wang","Yuanqi Du","Soha Hassoun","Li-Ping Liu"],"pdf_url":"https://arxiv.org/pdf/2309.12931v2.pdf","comment":"NIPS 2023"},{"id":"http://arxiv.org/abs/2309.01029v3","updated":"2023-11-28T19:04:45Z","published":"2023-09-02T22:14:26Z","title":"Explainability for Large Language Models: A Survey","summary":" Large language models (LLMs) have demonstrated impressive capabilities in\nnatural language processing. However, their internal mechanisms are still\nunclear and this lack of transparency poses unwanted risks for downstream\napplications. Therefore, understanding and explaining these models is crucial\nfor elucidating their behaviors, limitations, and social impacts. In this\npaper, we introduce a taxonomy of explainability techniques and provide a\nstructured overview of methods for explaining Transformer-based language\nmodels. We categorize techniques based on the training paradigms of LLMs:\ntraditional fine-tuning-based paradigm and prompting-based paradigm. For each\nparadigm, we summarize the goals and dominant approaches for generating local\nexplanations of individual predictions and global explanations of overall model\nknowledge. We also discuss metrics for evaluating generated explanations, and\ndiscuss how explanations can be leveraged to debug models and improve\nperformance. 
Lastly, we examine key challenges and emerging opportunities for\nexplanation techniques in the era of LLMs in comparison to conventional machine\nlearning models.\n","authors":["Haiyan Zhao","Hanjie Chen","Fan Yang","Ninghao Liu","Huiqi Deng","Hengyi Cai","Shuaiqiang Wang","Dawei Yin","Mengnan Du"],"pdf_url":"https://arxiv.org/pdf/2309.01029v3.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.02816v2","updated":"2023-11-28T17:06:48Z","published":"2023-08-05T08:12:34Z","title":"PromptCARE: Prompt Copyright Protection by Watermark Injection and\n Verification","summary":" Large language models (LLMs) have witnessed a meteoric rise in popularity\namong the general public users over the past few months, facilitating diverse\ndownstream tasks with human-level accuracy and proficiency. Prompts play an\nessential role in this success, which efficiently adapt pre-trained LLMs to\ntask-specific applications by simply prepending a sequence of tokens to the\nquery texts. However, designing and selecting an optimal prompt can be both\nexpensive and demanding, leading to the emergence of Prompt-as-a-Service\nproviders who profit by providing well-designed prompts for authorized use.\nWith the growing popularity of prompts and their indispensable role in\nLLM-based services, there is an urgent need to protect the copyright of prompts\nagainst unauthorized use.\n In this paper, we propose PromptCARE, the first framework for prompt\ncopyright protection through watermark injection and verification. Prompt\nwatermarking presents unique challenges that render existing watermarking\ntechniques developed for model and dataset copyright verification ineffective.\nPromptCARE overcomes these hurdles by proposing watermark injection and\nverification schemes tailor-made for prompts and NLP characteristics. Extensive\nexperiments on six well-known benchmark datasets, using three prevalent\npre-trained LLMs (BERT, RoBERTa, and Facebook OPT-1.3b), demonstrate the\neffectiveness, harmlessness, robustness, and stealthiness of PromptCARE.\n","authors":["Hongwei Yao","Jian Lou","Kui Ren","Zhan Qin"],"pdf_url":"https://arxiv.org/pdf/2308.02816v2.pdf","comment":"To Appear in the 45th IEEE Symposium on Security and Privacy 2024,\n code is available at: https://github.com/grasses/PromptCARE"},{"id":"http://arxiv.org/abs/2311.13409v2","updated":"2023-11-28T12:12:46Z","published":"2023-11-22T14:13:27Z","title":"CompenHR: Efficient Full Compensation for High-resolution Projector","summary":" Full projector compensation is a practical task of projector-camera systems.\nIt aims to find a projector input image, named compensation image, such that\nwhen projected it cancels the geometric and photometric distortions due to the\nphysical environment and hardware. State-of-the-art methods use deep learning\nto address this problem and show promising performance for low-resolution\nsetups. However, directly applying deep learning to high-resolution setups is\nimpractical due to the long training time and high memory cost. To address this\nissue, this paper proposes a practical full compensation solution. Firstly, we\ndesign an attention-based grid refinement network to improve geometric\ncorrection quality. Secondly, we integrate a novel sampling scheme into an\nend-to-end compensation network to alleviate computation and introduce\nattention blocks to preserve key features. Finally, we construct a benchmark\ndataset for high-resolution projector full compensation. 
In experiments, our\nmethod demonstrates clear advantages in both efficiency and quality.\n","authors":["Yuxi Wang","Haibin Ling","Bingyao Huang"],"pdf_url":"https://arxiv.org/pdf/2311.13409v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11255v2","updated":"2023-11-28T10:24:12Z","published":"2023-11-19T06:50:52Z","title":"M$^{2}$UGen: Multi-modal Music Understanding and Generation with the\n Power of Large Language Models","summary":" The current landscape of research leveraging large language models (LLMs) is\nexperiencing a surge. Many works harness the powerful reasoning capabilities of\nthese models to comprehend various modalities, such as text, speech, images,\nvideos, etc. They also utilize LLMs to understand human intention and generate\ndesired outputs like images, videos, and music. However, research that combines\nboth understanding and generation using LLMs is still limited and in its\nnascent stage. To address this gap, we introduce a Multi-modal Music\nUnderstanding and Generation (M$^{2}$UGen) framework that integrates LLM's\nabilities to comprehend and generate music for different modalities. The\nM$^{2}$UGen framework is purpose-built to unlock creative potential from\ndiverse sources of inspiration, encompassing music, image, and video through\nthe use of pretrained MERT, ViT, and ViViT models, respectively. To enable\nmusic generation, we explore the use of AudioLDM 2 and MusicGen. Bridging\nmulti-modal understanding and music generation is accomplished through the\nintegration of the LLaMA 2 model. Furthermore, we make use of the MU-LLaMA\nmodel to generate extensive datasets that support text/image/video-to-music\ngeneration, facilitating the training of our M$^{2}$UGen framework. We conduct\na thorough evaluation of our proposed framework. The experimental results\ndemonstrate that our model achieves or surpasses the performance of the current\nstate-of-the-art models.\n","authors":["Atin Sakkeer Hussain","Shansong Liu","Chenshuo Sun","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2311.11255v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14680v2","updated":"2023-11-28T10:03:24Z","published":"2023-11-01T18:25:13Z","title":"E-polis: A serious game for the gamification of sociological surveys","summary":" E-polis is a multi-platform serious game that gamifies a sociological survey\nfor studying young people's opinions regarding their ideal society. The\ngameplay is based on a user navigating through a digital city, experiencing the\nchanges inflicted, triggered by responses to social and pedagogical surveys,\nknown as \"dilemmas\". The game integrates elements of adventure, exploration,\nand simulation. Unity was the selected game engine used for the development of\nthe game, while a middleware component was also developed to gather and process\nthe users' data. At the end of each game, users are presented with a blueprint\nof the city they navigated to showcase how their choices influenced its\ndevelopment. 
This motivates them to reflect on their answers and validate them.\nThe game can be used to collect data on a variety of topics, such as social\njustice, and economic development, or to promote civic engagement and encourage\nyoung people to think critically about the world around them.\n","authors":["Alexandros Gazis","Eleftheria Katsiri"],"pdf_url":"https://arxiv.org/pdf/2311.14680v2.pdf","comment":"8 pages, 11 figures, Proceedings of the International Conference on\n Applied Mathematics & Computer Science (ICAMCS) 2023"},{"id":"http://arxiv.org/abs/2311.16462v1","updated":"2023-11-28T03:45:29Z","published":"2023-11-28T03:45:29Z","title":"Viewport Prediction for Volumetric Video Streaming by Exploring Video\n Saliency and Trajectory Information","summary":" Volumetric video, also known as hologram video, is a novel medium that\nportrays natural content in Virtual Reality (VR), Augmented Reality (AR), and\nMixed Reality (MR). It is expected to be the next-gen video technology and a\nprevalent use case for 5G and beyond wireless communication. Considering that\neach user typically only watches a section of the volumetric video, known as\nthe viewport, it is essential to have precise viewport prediction for optimal\nperformance. However, research on this topic is still in its infancy. In the\nend, this paper presents and proposes a novel approach, named Saliency and\nTrajectory Viewport Prediction (STVP), which aims to improve the precision of\nviewport prediction in volumetric video streaming. The STVP extensively\nutilizes video saliency information and viewport trajectory. To our knowledge,\nthis is the first comprehensive study of viewport prediction in volumetric\nvideo streaming. In particular, we introduce a novel sampling method, Uniform\nRandom Sampling (URS), to reduce computational complexity while still\npreserving video features in an efficient manner. Then we present a saliency\ndetection technique that incorporates both spatial and temporal information for\ndetecting static, dynamic geometric, and color salient regions. Finally, we\nintelligently fuse saliency and trajectory information to achieve more accurate\nviewport prediction. We conduct extensive simulations to evaluate the\neffectiveness of our proposed viewport prediction methods using\nstate-of-the-art volumetric video sequences. The experimental results show the\nsuperiority of the proposed method over existing schemes. The dataset and\nsource code will be publicly accessible after acceptance.\n","authors":["Jie Li","Zhixin Li","Zhi Liu","Pengyuan Zhou","Richang Hong","Qiyue Li","Han Hu"],"pdf_url":"https://arxiv.org/pdf/2311.16462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08006v3","updated":"2023-11-28T00:57:15Z","published":"2023-10-12T03:19:13Z","title":"MCPNS: A Macropixel Collocated Position and Its Neighbors Search for\n Plenoptic 2.0 Video Coding","summary":" Recently, it was demonstrated that a newly focused plenoptic 2.0 camera can\ncapture much higher spatial resolution owing to its effective light field\nsampling, as compared to a traditional unfocused plenoptic 1.0 camera. However,\ndue to the nature difference of the optical structure between the plenoptic 1.0\nand 2.0 cameras, the existing fast motion estimation (ME) method for plenoptic\n1.0 videos is expected to be sub-optimal for encoding plenoptic 2.0 videos. 
In\nthis paper, we point out the main motion characteristic differences between\nplenoptic 1.0 and 2.0 videos and then propose a new fast ME, called macropixel\ncollocated position and its neighbors search (MCPNS) for plenoptic 2.0 videos.\nIn detail, we propose to reduce the number of macropixel collocated position\n(MCP) search candidates based on the new observation of center-biased motion\nvector distribution at macropixel resolution. After that, due to large motion\ndeviation behavior around each MCP location in plenoptic 2.0 videos, we propose\nto select a certain number of key MCP locations with the lowest matching cost\nto perform the neighbors MCP search to improve the motion search accuracy.\nDifferent from existing methods, our method can achieve better performance\nwithout requiring prior knowledge of microlens array orientations. Our\nsimulation results confirmed the effectiveness of the proposed algorithm in\nterms of both bitrate savings and computational costs compared to existing\nmethods.\n","authors":["Vinh Van Duong","Thuc Nguyen Huu","Jonghoon Yim","Byeungwoo Jeon"],"pdf_url":"https://arxiv.org/pdf/2310.08006v3.pdf","comment":"Under review"}]},"2023-11-29T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2311.17898v1","updated":"2023-11-29T18:51:46Z","published":"2023-11-29T18:51:46Z","title":"Knowledge Pursuit Prompting for Zero-Shot Multimodal Synthesis","summary":" Hallucinations and unfaithful synthesis due to inaccurate prompts with\ninsufficient semantic details are widely observed in multimodal generative\nmodels. A prevalent strategy to align multiple modalities is to fine-tune the\ngenerator with a large number of annotated text-image pairs. However, such a\nprocedure is labor-consuming and resource-draining. The key question we ask is:\ncan we enhance the quality and faithfulness of text-driven generative models\nbeyond extensive text-image pair annotations? To address this question, we\npropose Knowledge Pursuit Prompting (KPP), a zero-shot framework that\niteratively incorporates external knowledge to help generators produce reliable\nvisual content. Instead of training generators to handle generic prompts, KPP\nemploys a recursive knowledge query process to gather informative external\nfacts from the knowledge base, instructs a language model to compress the\nacquired knowledge for prompt refinement, and utilizes text-driven generators\nfor visual synthesis. The entire process is zero-shot, without accessing the\narchitectures and parameters of generative models. We evaluate the framework\nacross multiple text-driven generative tasks (image, 3D rendering, and video)\non datasets of different domains. 
We further demonstrate the extensibility and\nadaptability of KPP through varying foundation model bases and instructions.\nOur results show that KPP is capable of generating faithful and semantically\nrich content across diverse visual domains, offering a promising solution to\nimprove multimodal generative models.\n","authors":["Jinqi Luo","Kwan Ho Ryan Chan","Dimitris Dimos","René Vidal"],"pdf_url":"https://arxiv.org/pdf/2311.17898v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17892v1","updated":"2023-11-29T18:46:29Z","published":"2023-11-29T18:46:29Z","title":"A Pipeline For Discourse Circuits From CCG","summary":" There is a significant disconnect between linguistic theory and modern NLP\npractice, which relies heavily on inscrutable black-box architectures.\nDisCoCirc is a newly proposed model for meaning that aims to bridge this\ndivide, by providing neuro-symbolic models that incorporate linguistic\nstructure. DisCoCirc represents natural language text as a `circuit' that\ncaptures the core semantic information of the text. These circuits can then be\ninterpreted as modular machine learning models. Additionally, DisCoCirc fulfils\nanother major aim of providing an NLP model that can be implemented on\nnear-term quantum computers.\n In this paper we describe a software pipeline that converts English text to\nits DisCoCirc representation. The pipeline achieves coverage over a large\nfragment of the English language. It relies on Combinatory Categorial Grammar\n(CCG) parses of the input text as well as coreference resolution information.\nThis semantic and syntactic information is used in several steps to convert the\ntext into a simply-typed $\\lambda$-calculus term, and then into a circuit\ndiagram. This pipeline will enable the application of the DisCoCirc framework\nto NLP tasks, using both classical and quantum approaches.\n","authors":["Jonathon Liu","Razin A. Shaikh","Benjamin Rodatz","Richie Yeung","Bob Coecke"],"pdf_url":"https://arxiv.org/pdf/2311.17892v1.pdf","comment":"39 pages, many figures"},{"id":"http://arxiv.org/abs/2311.17842v1","updated":"2023-11-29T17:46:25Z","published":"2023-11-29T17:46:25Z","title":"Look Before You Leap: Unveiling the Power of GPT-4V in Robotic\n Vision-Language Planning","summary":" In this study, we are interested in imbuing robots with the capability of\nphysically-grounded task planning. Recent advancements have shown that large\nlanguage models (LLMs) possess extensive knowledge useful in robotic tasks,\nespecially in reasoning and planning. However, LLMs are constrained by their\nlack of world grounding and dependence on external affordance models to\nperceive environmental information, which cannot jointly reason with LLMs. We\nargue that a task planner should be an inherently grounded, unified multimodal\nsystem. To this end, we introduce Robotic Vision-Language Planning (ViLa), a\nnovel approach for long-horizon robotic planning that leverages vision-language\nmodels (VLMs) to generate a sequence of actionable steps. ViLa directly\nintegrates perceptual data into its reasoning and planning process, enabling a\nprofound understanding of commonsense knowledge in the visual world, including\nspatial layouts and object attributes. It also supports flexible multimodal\ngoal specification and naturally incorporates visual feedback. 
Our extensive\nevaluation, conducted in both real-robot and simulated environments,\ndemonstrates ViLa's superiority over existing LLM-based planners, highlighting\nits effectiveness in a wide array of open-world manipulation tasks.\n","authors":["Yingdong Hu","Fanqi Lin","Tong Zhang","Li Yi","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2311.17842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17813v1","updated":"2023-11-29T17:04:15Z","published":"2023-11-29T17:04:15Z","title":"Higher-Order DisCoCat (Peirce-Lambek-Montague semantics)","summary":" We propose a new definition of higher-order DisCoCat (categorical\ncompositional distributional) models where the meaning of a word is not a\ndiagram, but a diagram-valued higher-order function. Our models can be seen as\na variant of Montague semantics based on a lambda calculus where the primitives\nact on string diagrams rather than logical formulae. As a special case, we show\nhow to translate from the Lambek calculus into Peirce's system beta for\nfirst-order logic. This allows us to give a purely diagrammatic treatment of\nhigher-order and non-linear processes in natural language semantics: adverbs,\nprepositions, negation and quantifiers. The theoretical definition presented in\nthis article comes with a proof-of-concept implementation in DisCoPy, the\nPython library for string diagrams.\n","authors":["Alexis Toumi","Giovanni de Felice"],"pdf_url":"https://arxiv.org/pdf/2311.17813v1.pdf","comment":"19 pages, 11 figures"},{"id":"http://arxiv.org/abs/2310.09949v3","updated":"2023-11-29T16:34:49Z","published":"2023-10-15T20:57:25Z","title":"Chameleon: a heterogeneous and disaggregated accelerator system for\n retrieval-augmented language models","summary":" A Retrieval-Augmented Language Model (RALM) augments a generative language\nmodel by retrieving context-specific knowledge from an external database. This\nstrategy facilitates impressive text generation quality even with smaller\nmodels, thus reducing orders of magnitude of computational demands. However,\nRALMs introduce unique system design challenges due to (a) the diverse workload\ncharacteristics between LM inference and retrieval and (b) the various system\nrequirements and bottlenecks for different RALM configurations such as model\nsizes, database sizes, and retrieval frequencies. We propose Chameleon, a\nheterogeneous accelerator system that integrates both LM and retrieval\naccelerators in a disaggregated architecture. The heterogeneity ensures\nefficient acceleration of both LM inference and retrieval, while the\naccelerator disaggregation enables the system to independently scale both types\nof accelerators to fulfill diverse RALM requirements. Our Chameleon prototype\nimplements retrieval accelerators on FPGAs and assigns LM inference to GPUs,\nwith a CPU server orchestrating these accelerators over the network. Compared\nto CPU-based and CPU-GPU vector search systems, Chameleon achieves up to 23.72x\nspeedup and 26.2x energy efficiency. Evaluated on various RALMs, Chameleon\nexhibits up to 2.16x reduction in latency and 3.18x speedup in throughput\ncompared to the hybrid CPU-GPU architecture. 
These promising results pave the\nway for bringing accelerator heterogeneity and disaggregation into future RALM\nsystems.\n","authors":["Wenqi Jiang","Marco Zeller","Roger Waleffe","Torsten Hoefler","Gustavo Alonso"],"pdf_url":"https://arxiv.org/pdf/2310.09949v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17786v1","updated":"2023-11-29T16:33:19Z","published":"2023-11-29T16:33:19Z","title":"DSS: Synthesizing long Digital Ink using Data augmentation, Style\n encoding and Split generation","summary":" As text generative models can give increasingly long answers, we tackle the\nproblem of synthesizing long text in digital ink. We show that the commonly\nused models for this task fail to generalize to long-form data and how this\nproblem can be solved by augmenting the training data, changing the model\narchitecture and the inference procedure. These methods use contrastive\nlearning technique and are tailored specifically for the handwriting domain.\nThey can be applied to any encoder-decoder model that works with digital ink.\nWe demonstrate that our method reduces the character error rate on long-form\nEnglish data by half compared to baseline RNN and by 16% compared to the\nprevious approach that aims at addressing the same problem. We show that all\nthree parts of the method improve recognizability of generated inks. In\naddition, we evaluate synthesized data in a human study and find that people\nperceive most of generated data as real.\n","authors":["Aleksandr Timofeev","Anastasiia Fadeeva","Andrei Afonin","Claudiu Musat","Andrii Maksai"],"pdf_url":"https://arxiv.org/pdf/2311.17786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13534v3","updated":"2023-11-29T16:18:38Z","published":"2023-11-22T17:14:54Z","title":"LM-Cocktail: Resilient Tuning of Language Models via Model Merging","summary":" The pre-trained language models are continually fine-tuned to better support\ndownstream applications. However, this operation may result in significant\nperformance degeneration on general tasks beyond the targeted domain. To\novercome this problem, we propose LM-Cocktail which enables the fine-tuned\nmodel to stay resilient in general perspectives. Our method is conducted in the\nform of model merging, where the fine-tuned language model is merged with the\npre-trained base model or the peer models from other domains through weighted\naverage. Despite simplicity, LM-Cocktail is surprisingly effective: the\nresulted model is able to achieve a strong empirical performance in the whole\nscope of general tasks while preserving a superior capacity in its targeted\ndomain. We conduct comprehensive experiments with LLama and BGE model on\npopular benchmarks, including FLAN, MMLU, MTEB, whose results validate the\nefficacy of our proposed method. The code and checkpoints are available at\nhttps://github.com/FlagOpen/FlagEmbedding/tree/master/LM_Cocktail.\n","authors":["Shitao Xiao","Zheng Liu","Peitian Zhang","Xingrun Xing"],"pdf_url":"https://arxiv.org/pdf/2311.13534v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17771v1","updated":"2023-11-29T16:11:45Z","published":"2023-11-29T16:11:45Z","title":"Supervising the Centroid Baseline for Extractive Multi-Document\n Summarization","summary":" The centroid method is a simple approach for extractive multi-document\nsummarization and many improvements to its pipeline have been proposed. 
We\nfurther refine it by adding a beam search process to the sentence selection and\nalso a centroid estimation attention model that leads to improved results. We\ndemonstrate this in several multi-document summarization datasets, including in\na multilingual scenario.\n","authors":["Simão Gonçalves","Gonçalo Correia","Diogo Pernes","Afonso Mendes"],"pdf_url":"https://arxiv.org/pdf/2311.17771v1.pdf","comment":"Accepted at \"The 4th New Frontiers in Summarization (with LLMs)\n Workshop\""},{"id":"http://arxiv.org/abs/2311.16989v2","updated":"2023-11-29T16:00:05Z","published":"2023-11-28T17:44:51Z","title":"ChatGPT's One-year Anniversary: Are Open-Source Large Language Models\n Catching up?","summary":" Upon its release in late 2022, ChatGPT has brought a seismic shift in the\nentire landscape of AI, both in research and commerce. Through\ninstruction-tuning a large language model (LLM) with supervised fine-tuning and\nreinforcement learning from human feedback, it showed that a model could answer\nhuman questions and follow instructions on a broad panel of tasks. Following\nthis success, interests in LLMs have intensified, with new LLMs flourishing at\nfrequent interval across academia and industry, including many start-ups\nfocused on LLMs. While closed-source LLMs (e.g., OpenAI's GPT, Anthropic's\nClaude) generally outperform their open-source counterparts, the progress on\nthe latter has been rapid with claims of achieving parity or even better on\ncertain tasks. This has crucial implications not only on research but also on\nbusiness. In this work, on the first anniversary of ChatGPT, we provide an\nexhaustive overview of this success, surveying all tasks where an open-source\nLLM has claimed to be on par or better than ChatGPT.\n","authors":["Hailin Chen","Fangkai Jiao","Xingxuan Li","Chengwei Qin","Mathieu Ravaut","Ruochen Zhao","Caiming Xiong","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2311.16989v2.pdf","comment":"version v2, applied several minor changes"},{"id":"http://arxiv.org/abs/2311.17041v2","updated":"2023-11-29T15:52:55Z","published":"2023-11-28T18:53:06Z","title":"Efficient In-Context Learning in Vision-Language Models for Egocentric\n Videos","summary":" Recent advancements in text-only large language models (LLMs) have\nhighlighted the benefit of in-context learning for adapting to new tasks with a\nfew demonstrations. However, extending in-context learning to large\nvision-language models (VLMs) using a huge amount of naturalistic\nvision-language data has shown limited success, particularly for egocentric\nvideos, due to high data collection costs. We propose a novel training method\n$\\mathbb{E}$fficient $\\mathbb{I}$n-context $\\mathbb{L}$earning on\n$\\mathbb{E}$gocentric $\\mathbb{V}$ideos ($\\mathbb{EILEV}$), which elicits\nin-context learning in VLMs for egocentric videos without requiring massive,\nnaturalistic egocentric video datasets. $\\mathbb{EILEV}$ involves architectural\nand training data adaptations to allow the model to process contexts\ninterleaved with video clips and narrations, sampling of in-context examples\nwith clusters of similar verbs and nouns, use of data with skewed marginal\ndistributions with a long tail of infrequent verbs and nouns, as well as\nhomonyms and synonyms. Our evaluations show that $\\mathbb{EILEV}$-trained\nmodels outperform larger VLMs trained on a huge amount of naturalistic data in\nin-context learning. 
Furthermore, they can generalize to not only\nout-of-distribution, but also novel, rare egocentric videos and texts via\nin-context learning, demonstrating potential for applications requiring\ncost-effective training, and rapid post-deployment adaptability. Our code and\ndemo are available at \\url{https://github.com/yukw777/EILEV}.\n","authors":["Keunwoo Peter Yu","Zheyuan Zhang","Fengyuan Hu","Joyce Chai"],"pdf_url":"https://arxiv.org/pdf/2311.17041v2.pdf","comment":"10 pages, LaTeX; added acknowledgments"},{"id":"http://arxiv.org/abs/2311.17743v1","updated":"2023-11-29T15:49:24Z","published":"2023-11-29T15:49:24Z","title":"Mukhyansh: A Headline Generation Dataset for Indic Languages","summary":" The task of headline generation within the realm of Natural Language\nProcessing (NLP) holds immense significance, as it strives to distill the true\nessence of textual content into concise and attention-grabbing summaries. While\nnoteworthy progress has been made in headline generation for widely spoken\nlanguages like English, there persist numerous challenges when it comes to\ngenerating headlines in low-resource languages, such as the rich and diverse\nIndian languages. A prominent obstacle that specifically hinders headline\ngeneration in Indian languages is the scarcity of high-quality annotated data.\nTo address this crucial gap, we proudly present Mukhyansh, an extensive\nmultilingual dataset, tailored for Indian language headline generation.\nComprising an impressive collection of over 3.39 million article-headline\npairs, Mukhyansh spans across eight prominent Indian languages, namely Telugu,\nTamil, Kannada, Malayalam, Hindi, Bengali, Marathi, and Gujarati. We present a\ncomprehensive evaluation of several state-of-the-art baseline models.\nAdditionally, through an empirical analysis of existing works, we demonstrate\nthat Mukhyansh outperforms all other models, achieving an impressive average\nROUGE-L score of 31.43 across all 8 languages.\n","authors":["Lokesh Madasu","Gopichand Kanumolu","Nirmal Surange","Manish Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2311.17743v1.pdf","comment":"Accepted at PACLIC 2023"},{"id":"http://arxiv.org/abs/2311.17741v1","updated":"2023-11-29T15:44:39Z","published":"2023-11-29T15:44:39Z","title":"End-to-end Joint Rich and Normalized ASR with a limited amount of rich\n training data","summary":" Joint rich and normalized automatic speech recognition (ASR), that produces\ntranscriptions both with and without punctuation and capitalization, remains a\nchallenge. End-to-end (E2E) ASR models offer both convenience and the ability\nto perform such joint transcription of speech. Training such models requires\npaired speech and rich text data, which is not widely available. In this paper,\nwe compare two different approaches to train a stateless Transducer-based E2E\njoint rich and normalized ASR system, ready for streaming applications, with a\nlimited amount of rich labeled data. The first approach uses a language model\nto generate pseudo-rich transcriptions of normalized training data. The second\napproach uses a single decoder conditioned on the type of the output. The first\napproach leads to E2E rich ASR which perform better on out-of-domain data, with\nup to 9% relative reduction in errors. 
The second approach demonstrates the\nfeasibility of an E2E joint rich and normalized ASR system using as little as 5%\nrich training data with a moderate (2.42% absolute) increase in errors.\n","authors":["Can Cui","Imran Ahamad Sheikh","Mostafa Sadeghi","Emmanuel Vincent"],"pdf_url":"https://arxiv.org/pdf/2311.17741v1.pdf","comment":"Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2311.17722v1","updated":"2023-11-29T15:21:35Z","published":"2023-11-29T15:21:35Z","title":"SenTest: Evaluating Robustness of Sentence Encoders","summary":" Contrastive learning has proven to be an effective method for pre-training\nmodels using weakly labeled data in the vision domain. Sentence transformers\nare the NLP counterparts to this architecture, and have been growing in\npopularity due to their rich and effective sentence representations. Having\neffective sentence representations is paramount in multiple tasks, such as\ninformation retrieval, retrieval augmented generation (RAG), and sentence\ncomparison. Keeping in mind the deployability factor of transformers,\nevaluating the robustness of sentence transformers is of utmost importance.\nThis work focuses on evaluating the robustness of sentence encoders. We\nemploy several adversarial attacks to evaluate their robustness. This system uses\ncharacter-level attacks in the form of random character substitution,\nword-level attacks in the form of synonym replacement, and sentence-level\nattacks in the form of intra-sentence word order shuffling. The results of the\nexperiments strongly undermine the robustness of sentence encoders. The models\nproduce significantly different predictions as well as embeddings on perturbed\ndatasets. The accuracy of the models can fall by up to 15 percent on perturbed\ndatasets as compared to unperturbed datasets. Furthermore, the experiments\ndemonstrate that these embeddings do capture the semantic and syntactic\nstructure (sentence order) of sentences. However, existing supervised\nclassification strategies fail to leverage this information, and merely\nfunction as n-gram detectors.\n","authors":["Tanmay Chavan","Shantanu Patankar","Aditya Kane","Omkar Gokhale","Geetanjali Kale","Raviraj Joshi"],"pdf_url":"https://arxiv.org/pdf/2311.17722v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05703v3","updated":"2023-11-29T15:12:00Z","published":"2023-10-09T13:24:44Z","title":"An Attribution Method for Siamese Encoders","summary":" Despite the success of Siamese encoder models such as sentence transformers\n(ST), little is known about the aspects of inputs they pay attention to. A\nbarrier is that their predictions cannot be attributed to individual features,\nas they compare two inputs rather than processing a single one. This paper\nderives a local attribution method for Siamese encoders by generalizing the\nprinciple of integrated gradients to models with multiple inputs. The solution\ntakes the form of feature-pair attributions, and can be reduced to a\ntoken-token matrix for STs. Our method involves the introduction of integrated\nJacobians and inherits the advantageous formal properties of integrated\ngradients: it accounts for the model's full computation graph and is guaranteed\nto converge to the actual prediction. A pilot study shows that in an ST few\ntoken-pairs can often explain large fractions of predictions, and it focuses on\nnouns and verbs. 
For accurate predictions, it however needs to attend to the\nmajority of tokens and parts of speech.\n","authors":["Lucas Möller","Dmitry Nikolaev","Sebastian Padó"],"pdf_url":"https://arxiv.org/pdf/2310.05703v3.pdf","comment":"Accepted to EMNLP'23"},{"id":"http://arxiv.org/abs/2310.01929v2","updated":"2023-11-29T15:11:02Z","published":"2023-10-03T10:13:36Z","title":"Navigating Cultural Chasms: Exploring and Unlocking the Cultural POV of\n Text-To-Image Models","summary":" Text-To-Image (TTI) models, such as DALL-E and StableDiffusion, have\ndemonstrated remarkable prompt-based image generation capabilities.\nMultilingual encoders may have a substantial impact on the cultural agency of\nthese models, as language is a conduit of culture. In this study, we explore\nthe cultural perception embedded in TTI models by characterizing culture across\nthree hierarchical tiers: cultural dimensions, cultural domains, and cultural\nconcepts. Based on this ontology, we derive prompt templates to unlock the\ncultural knowledge in TTI models, and propose a comprehensive suite of\nevaluation techniques, including intrinsic evaluations using the CLIP space,\nextrinsic evaluations with a Visual-Question-Answer (VQA) model and human\nassessments, to evaluate the cultural content of TTI-generated images. To\nbolster our research, we introduce the CulText2I dataset, derived from four\ndiverse TTI models and spanning ten languages. Our experiments provide insights\nregarding Do, What, Which and How research questions about the nature of\ncultural encoding in TTI models, paving the way for cross-cultural applications\nof these models.\n","authors":["Mor Ventura","Eyal Ben-David","Anna Korhonen","Roi Reichart"],"pdf_url":"https://arxiv.org/pdf/2310.01929v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17696v1","updated":"2023-11-29T15:02:46Z","published":"2023-11-29T15:02:46Z","title":"How to Build an AI Tutor that Can Adapt to Any Course and Provide\n Accurate Answers Using Large Language Model and Retrieval-Augmented\n Generation","summary":" Artificial intelligence is transforming education through data-driven,\npersonalized learning solutions. This paper introduces AI Tutor, an innovative\nweb application that provides personalized tutoring in any subject using\nstate-of-the-art Large Language Model (LLM). AI Tutor ingests course materials\nto construct an adaptive knowledge base tailored to the course. When students\npose questions, it retrieves the most relevant information and generates\ndetailed, conversational responses citing supporting evidence. The system is\npowered by advanced large language models and Retrieval-Augmented Generation\n(RAG) techniques for accurate, natural question answering. We present a\nfully-functional web interface and video demonstration that showcase AI Tutor's\nversatility across diverse subjects and its ability to produce pedagogically\ncogent responses. 
While an initial prototype, this work represents a pioneering\nstep toward AI-enabled tutoring systems that can democratize access to\nhigh-quality, customized educational support.\n","authors":["Chenxi Dong"],"pdf_url":"https://arxiv.org/pdf/2311.17696v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2311.17686v1","updated":"2023-11-29T14:49:31Z","published":"2023-11-29T14:49:31Z","title":"AviationGPT: A Large Language Model for the Aviation Domain","summary":" The advent of ChatGPT and GPT-4 has captivated the world with large language\nmodels (LLMs), demonstrating exceptional performance in question-answering,\nsummarization, and content generation. The aviation industry is characterized\nby an abundance of complex, unstructured text data, replete with technical\njargon and specialized terminology. Moreover, labeled data for model building\nare scarce in this domain, resulting in low usage of aviation text data. The\nemergence of LLMs presents an opportunity to transform this situation, but\nthere is a lack of LLMs specifically designed for the aviation domain. To\naddress this gap, we propose AviationGPT, which is built on open-source LLaMA-2\nand Mistral architectures and continuously trained on a wealth of carefully\ncurated aviation datasets. Experimental results reveal that AviationGPT offers\nusers multiple advantages, including the versatility to tackle diverse natural\nlanguage processing (NLP) problems (e.g., question-answering, summarization,\ndocument writing, information extraction, report querying, data cleaning, and\ninteractive data exploration). It also provides accurate and contextually\nrelevant responses within the aviation domain and significantly improves\nperformance (e.g., over a 40% performance gain in tested cases). With\nAviationGPT, the aviation industry is better equipped to address more complex\nresearch problems and enhance the efficiency and safety of National Airspace\nSystem (NAS) operations.\n","authors":["Liya Wang","Jason Chou","Xin Zhou","Alex Tien","Diane M Baumgartner"],"pdf_url":"https://arxiv.org/pdf/2311.17686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05199v5","updated":"2023-11-29T14:45:53Z","published":"2023-10-08T15:14:39Z","title":"Loose lips sink ships: Mitigating Length Bias in Reinforcement Learning\n from Human Feedback","summary":" Reinforcement learning from human feedback serves as a crucial bridge,\naligning large language models with human and societal values. This alignment\nrequires a vast corpus of human feedback to learn a reward model, which is\nsubsequently used to finetune language models. However, we have identified that\nthe reward model often finds shortcuts to bypass its intended objectives,\nmisleadingly assuming that humans prefer longer responses. The emergence of\nlength bias often induces the model to favor longer outputs, yet it doesn't\nequate to an increase in helpful information within these outputs. In this\npaper, we propose an innovative solution, applying the Product-of-Experts (PoE)\ntechnique to separate reward modeling from the influence of sequence length. In\nour framework, the main expert concentrates on understanding human intents,\nwhile the biased expert targets the identification and capture of length bias.\nTo further enhance the learning of bias, we introduce perturbations into the\nbias-focused expert, disrupting the flow of semantic information. 
Experimental\nresults validate the effectiveness of our approach, indicating that language\nmodel performance is improved, irrespective of sequence length.\n","authors":["Wei Shen","Rui Zheng","Wenyu Zhan","Jun Zhao","Shihan Dou","Tao Gui","Qi Zhang","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2310.05199v5.pdf","comment":"EMNLP 2023 findings, Length Bias in RLHF, Mitigate bias in reward\n modeling"},{"id":"http://arxiv.org/abs/2305.09556v2","updated":"2023-11-29T14:45:46Z","published":"2023-05-16T15:53:24Z","title":"Adapting Sentence Transformers for the Aviation Domain","summary":" Learning effective sentence representations is crucial for many Natural\nLanguage Processing (NLP) tasks, including semantic search, semantic textual\nsimilarity (STS), and clustering. While multiple transformer models have been\ndeveloped for sentence embedding learning, these models may not perform\noptimally when dealing with specialized domains like aviation, which has unique\ncharacteristics such as technical jargon, abbreviations, and unconventional\ngrammar. Furthermore, the absence of labeled datasets makes it difficult to\ntrain models specifically for the aviation domain. To address these challenges,\nwe propose a novel approach for adapting sentence transformers for the aviation\ndomain. Our method is a two-stage process consisting of pre-training followed\nby fine-tuning. During pre-training, we use Transformers and Sequential\nDenoising AutoEncoder (TSDAE) with aviation text data as input to improve the\ninitial model performance. Subsequently, we fine-tune our models using a\nNatural Language Inference (NLI) dataset in the Sentence Bidirectional Encoder\nRepresentations from Transformers (SBERT) architecture to mitigate overfitting\nissues. Experimental results on several downstream tasks show that our adapted\nsentence transformers significantly outperform general-purpose transformers,\ndemonstrating the effectiveness of our approach in capturing the nuances of the\naviation domain. Overall, our work highlights the importance of domain-specific\nadaptation in developing high-quality NLP solutions for specialized industries\nlike aviation.\n","authors":["Liya Wang","Jason Chou","Dave Rouck","Alex Tien","Diane M Baumgartner"],"pdf_url":"https://arxiv.org/pdf/2305.09556v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17676v1","updated":"2023-11-29T14:39:38Z","published":"2023-11-29T14:39:38Z","title":"Improving Minority Stress Detection with Emotions","summary":" Psychological stress detection is an important task for mental healthcare\nresearch, but there has been little prior work investigating the effectiveness\nof psychological stress models on minority individuals, who are especially\nvulnerable to poor mental health outcomes. In this work, we use the related\ntask of minority stress detection to evaluate the ability of psychological\nstress models to understand the language of sexual and gender minorities. We\nfind that traditional psychological stress models underperform on minority\nstress detection, and we propose using emotion-infused models to reduce that\nperformance disparity. We further demonstrate that multi-task psychological\nstress models outperform the current state-of-the-art for minority stress\ndetection without directly training on minority stress data. 
We provide\nexplanatory analysis showing that minority communities have different\ndistributions of emotions than the general population and that emotion-infused\nmodels improve the performance of stress models on underrepresented groups\nbecause of their effectiveness in low-data environments, and we propose that\nintegrating emotions may benefit underrepresented groups in other mental health\ndetection tasks.\n","authors":["Jonathan Ivey","Susan Gauch"],"pdf_url":"https://arxiv.org/pdf/2311.17676v1.pdf","comment":"9 pages, 6 figures, under review"},{"id":"http://arxiv.org/abs/2311.17667v1","updated":"2023-11-29T14:30:16Z","published":"2023-11-29T14:30:16Z","title":"TimeBench: A Comprehensive Evaluation of Temporal Reasoning Abilities in\n Large Language Models","summary":" Understanding time is a pivotal aspect of human cognition, crucial in the\nbroader framework of grasping the intricacies of the world. Previous studies\ntypically focus on specific aspects of time, lacking a comprehensive temporal\nreasoning benchmark. To address this issue, we propose TimeBench, a\ncomprehensive hierarchical temporal reasoning benchmark that covers a broad\nspectrum of temporal reasoning phenomena, which provides a thorough evaluation\nfor investigating the temporal reasoning capabilities of large language models.\nWe conduct extensive experiments on popular LLMs, such as GPT-4, LLaMA2, and\nMistral, incorporating chain-of-thought prompting. Our experimental results\nindicate a significant performance gap between the state-of-the-art LLMs and\nhumans, highlighting that there is still a considerable distance to cover in\ntemporal reasoning. We aspire for TimeBench to serve as a comprehensive\nbenchmark, fostering research in temporal reasoning for LLMs. Our resource is\navailable at https://github.com/zchuz/TimeBench\n","authors":["Zheng Chu","Jingchang Chen","Qianglong Chen","Weijiang Yu","Haotian Wang","Ming Liu","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2311.17667v1.pdf","comment":"Resources at: https://github.com/zchuz/TimeBench"},{"id":"http://arxiv.org/abs/2311.17647v1","updated":"2023-11-29T14:08:53Z","published":"2023-11-29T14:08:53Z","title":"VIM: Probing Multimodal Large Language Models for Visual Embedded\n Instruction Following","summary":" We introduce VISUAL EMBEDDED INSTRUCTION (VIM), a new framework designed to\nevaluate the visual instruction following capability of Multimodal Large\nLanguage Models (MLLMs). As illustrated in Figure 2, VIM challenges the MLLMs\nby embedding the instructions into the visual scenes, demanding strong visual\ninterpretative skills for instruction following. We adapt VIM to various\nbenchmarks, including VQAv2, MME, MM-Vet, and RefCOCO series, compose a VIM\nbench, and probe diverse MLLMs across three distinct in-context learning\nsettings: Zero Shot, One Shot, and Pair Shot. We observe that there is a\nsignificant performance disparity between the open-source MLLMs and GPT-4V,\nimplying that their proficiency in visual instruction comprehension is not up\nto par. Our results highlight a promising direction for the enhancement of\nMLLMs capabilities on instruction following. 
We aim VIM to serve as a useful\nnorm for advancing the state of the art and driving further progress in the\nfield.\n","authors":["Yujie Lu","Xiujun Li","William Yang Wang","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2311.17647v1.pdf","comment":"20 pages, 8 figures, 20 tables"},{"id":"http://arxiv.org/abs/2305.04118v3","updated":"2023-11-29T13:52:23Z","published":"2023-05-06T19:03:12Z","title":"Exploring Human-Like Translation Strategy with Large Language Models","summary":" Large language models (LLMs) have demonstrated impressive capabilities in\ngeneral scenarios, exhibiting a level of aptitude that approaches, in some\naspects even surpasses, human-level intelligence. Among their numerous skills,\nthe translation abilities of LLMs have received considerable attention.\nCompared to typical machine translation that focuses solely on source-to-target\nmapping, LLM-based translation can potentially mimic the human translation\nprocess which might take preparatory steps to ensure high-quality translation.\nThis work explores this possibility by proposing the MAPS framework, which\nstands for Multi-Aspect Prompting and Selection. Specifically, we enable LLMs\nfirst to analyze the given source sentence and induce three aspects of\ntranslation-related knowledge: keywords, topics, and relevant demonstrations to\nguide the final translation process. Moreover, we employ a selection mechanism\nbased on quality estimation to filter out noisy and unhelpful knowledge. Both\nautomatic (3 LLMs x 11 directions x 2 automatic metrics) and human evaluation\n(preference study and MQM) demonstrate the effectiveness of MAPS. Further\nanalysis shows that by mimicking the human translation process, MAPS reduces\nvarious translation errors such as hallucination, ambiguity, mistranslation,\nawkward style, untranslated text, and omission. Source code is available at\nhttps://github.com/zwhe99/MAPS-mt.\n","authors":["Zhiwei He","Tian Liang","Wenxiang Jiao","Zhuosheng Zhang","Yujiu Yang","Rui Wang","Zhaopeng Tu","Shuming Shi","Xing Wang"],"pdf_url":"https://arxiv.org/pdf/2305.04118v3.pdf","comment":"To be published in TACL (pre-MIT Press publication version)"},{"id":"http://arxiv.org/abs/2311.17633v1","updated":"2023-11-29T13:51:04Z","published":"2023-11-29T13:51:04Z","title":"Introduction to Transformers: an NLP Perspective","summary":" Transformers have dominated empirical machine learning models of natural\nlanguage processing. In this paper, we introduce basic concepts of Transformers\nand present key techniques that form the recent advances of these models. This\nincludes a description of the standard Transformer architecture, a series of\nmodel refinements, and common applications. Given that Transformers and related\ndeep learning techniques might be evolving in ways we have never seen, we\ncannot dive into all the model details or cover all the technical areas.\nInstead, we focus on just those concepts that are helpful for gaining a good\nunderstanding of Transformers and their variants. 
We also summarize the key\nideas that impact this field, thereby yielding some insights into the strengths\nand limitations of these models.\n","authors":["Tong Xiao","Jingbo Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.17633v1.pdf","comment":"119 pages and 21 figures"},{"id":"http://arxiv.org/abs/2311.17593v1","updated":"2023-11-29T12:41:55Z","published":"2023-11-29T12:41:55Z","title":"LanGWM: Language Grounded World Model","summary":" Recent advances in deep reinforcement learning have showcased its potential\nin tackling complex tasks. However, experiments on visual control tasks have\nrevealed that state-of-the-art reinforcement learning models struggle with\nout-of-distribution generalization. Conversely, expressing higher-level\nconcepts and global contexts is relatively easy using language.\n Building upon recent success of the large language models, our main objective\nis to improve the state abstraction technique in reinforcement learning by\nleveraging language for robust action selection. Specifically, we focus on\nlearning language-grounded visual features to enhance the world model learning,\na model-based reinforcement learning technique.\n To enforce our hypothesis explicitly, we mask out the bounding boxes of a few\nobjects in the image observation and provide the text prompt as descriptions\nfor these masked objects. Subsequently, we predict the masked objects along\nwith the surrounding regions as pixel reconstruction, similar to the\ntransformer-based masked autoencoder approach.\n Our proposed LanGWM: Language Grounded World Model achieves state-of-the-art\nperformance in out-of-distribution test at the 100K interaction steps\nbenchmarks of iGibson point navigation tasks. Furthermore, our proposed\ntechnique of explicit language-grounded visual representation learning has the\npotential to improve models for human-robot interaction because our extracted\nvisual features are language grounded.\n","authors":["Rudra P. K. Poudel","Harit Pandya","Chao Zhang","Roberto Cipolla"],"pdf_url":"https://arxiv.org/pdf/2311.17593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13731v2","updated":"2023-11-29T11:52:58Z","published":"2023-09-24T19:26:53Z","title":"Arabic Sentiment Analysis with Noisy Deep Explainable Model","summary":" Sentiment Analysis (SA) is an indispensable task for many real-world\napplications. Compared to limited resourced languages (i.e., Arabic, Bengali),\nmost of the research on SA are conducted for high resourced languages (i.e.,\nEnglish, Chinese). Moreover, the reasons behind any prediction of the Arabic\nsentiment analysis methods exploiting advanced artificial intelligence\n(AI)-based approaches are like black-box - quite difficult to understand. This\npaper proposes an explainable sentiment classification framework for the Arabic\nlanguage by introducing a noise layer on Bi-Directional Long Short-Term Memory\n(BiLSTM) and Convolutional Neural Networks (CNN)-BiLSTM models that overcome\nover-fitting problem. The proposed framework can explain specific predictions\nby training a local surrogate explainable model to understand why a particular\nsentiment (positive or negative) is being predicted. We carried out experiments\non public benchmark Arabic SA datasets. The results concluded that adding noise\nlayers improves the performance in sentiment analysis for the Arabic language\nby reducing overfitting and our method outperformed some known state-of-the-art\nmethods. 
In addition, the introduced explainability with noise layer could make\nthe model more transparent and accountable and hence help adopting AI-enabled\nsystem in practice.\n","authors":["Md. Atabuzzaman","Md Shajalal","Maksuda Bilkis Baby","Alexander Boden"],"pdf_url":"https://arxiv.org/pdf/2309.13731v2.pdf","comment":"This is the pre-print version of our accepted paper at the 7th\n International Conference on Natural Language Processing and Information\n Retrieval~(ACM NLPIR'2023)"},{"id":"http://arxiv.org/abs/2305.14991v2","updated":"2023-11-29T10:47:58Z","published":"2023-05-24T10:26:13Z","title":"MuLER: Detailed and Scalable Reference-based Evaluation","summary":" We propose a novel methodology (namely, MuLER) that transforms any\nreference-based evaluation metric for text generation, such as machine\ntranslation (MT) into a fine-grained analysis tool. Given a system and a\nmetric, MuLER quantifies how much the chosen metric penalizes specific error\ntypes (e.g., errors in translating names of locations). MuLER thus enables a\ndetailed error analysis which can lead to targeted improvement efforts for\nspecific phenomena. We perform experiments in both synthetic and naturalistic\nsettings to support MuLER's validity and showcase its usability in MT\nevaluation, and other tasks, such as summarization. Analyzing all submissions\nto WMT in 2014-2020, we find consistent trends. For example, nouns and verbs\nare among the most frequent POS tags. However, they are among the hardest to\ntranslate. Performance on most POS tags improves with overall system\nperformance, but a few are not thus correlated (their identity changes from\nlanguage to language). Preliminary experiments with summarization reveal\nsimilar trends.\n","authors":["Taelin Karidi","Leshem Choshen","Gal Patel","Omri Abend"],"pdf_url":"https://arxiv.org/pdf/2305.14991v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10642v3","updated":"2023-11-29T10:41:36Z","published":"2023-11-17T16:58:52Z","title":"Rethinking Attention: Exploring Shallow Feed-Forward Neural Networks as\n an Alternative to Attention Layers in Transformers","summary":" This work presents an analysis of the effectiveness of using standard shallow\nfeed-forward networks to mimic the behavior of the attention mechanism in the\noriginal Transformer model, a state-of-the-art architecture for\nsequence-to-sequence tasks. We substitute key elements of the attention\nmechanism in the Transformer with simple feed-forward networks, trained using\nthe original components via knowledge distillation. Our experiments, conducted\non the IWSLT2017 dataset, reveal the capacity of these \"attentionless\nTransformers\" to rival the performance of the original architecture. 
Through\nrigorous ablation studies and experiments with various replacement network\ntypes and sizes, we offer insights that support the viability of our approach.\nThis not only sheds light on the adaptability of shallow feed-forward networks\nin emulating attention mechanisms but also underscores their potential to\nstreamline complex architectures for sequence-to-sequence tasks.\n","authors":["Vukasin Bozic","Danilo Dordevic","Daniele Coppola","Joseph Thommes","Sidak Pal Singh"],"pdf_url":"https://arxiv.org/pdf/2311.10642v3.pdf","comment":"Accepted at AAAI24(https://aaai.org/aaai-conference/)"},{"id":"http://arxiv.org/abs/2311.17514v1","updated":"2023-11-29T10:38:16Z","published":"2023-11-29T10:38:16Z","title":"Reinforcement Replaces Supervision: Query focused Summarization using\n Deep Reinforcement Learning","summary":" Query-focused Summarization (QfS) deals with systems that generate summaries\nfrom document(s) based on a query. Motivated by the insight that Reinforcement\nLearning (RL) provides a generalization to Supervised Learning (SL) for Natural\nLanguage Generation, and thereby performs better (empirically) than SL, we use\nan RL-based approach for this task of QfS. Additionally, we also resolve the\nconflict of employing RL in Transformers with Teacher Forcing. We develop\nmultiple Policy Gradient networks, trained on various reward signals: ROUGE,\nBLEU, and Semantic Similarity, which lead to a 10-point improvement over the\nState-of-the-Art approach on the ROUGE-L metric for a benchmark dataset (ELI5).\nWe also show the performance of our approach in a zero-shot setting for another\nbenchmark dataset (DebatePedia) -- our approach leads to results comparable to\nbaselines, which were specifically trained on DebatePedia. To aid the RL\ntraining, we propose a better semantic similarity reward, enabled by a novel\nPassage Embedding scheme developed using the Cluster Hypothesis. Lastly, we\ncontribute a gold-standard test dataset to further research in QfS and\nLong-form Question Answering (LfQA).\n","authors":["Swaroop Nath","Harshad Khadilkar","Pushpak Bhattacharyya"],"pdf_url":"https://arxiv.org/pdf/2311.17514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18023v2","updated":"2023-11-29T10:33:26Z","published":"2023-10-27T09:59:24Z","title":"SentMix-3L: A Bangla-English-Hindi Code-Mixed Dataset for Sentiment\n Analysis","summary":" Code-mixing is a well-studied linguistic phenomenon in which two or more\nlanguages are mixed in text or speech. Several datasets have been built with\nthe goal of training computational models for code-mixing. Although it is very\ncommon to observe code-mixing with multiple languages, most datasets available\ncontain code-mixing between only two languages. In this paper, we introduce\nSentMix-3L, a novel dataset for sentiment analysis containing code-mixed data\nbetween three languages: Bangla, English, and Hindi. We carry out a\ncomprehensive evaluation using SentMix-3L. 
We show that zero-shot prompting\nwith GPT-3.5 outperforms all transformer-based models on SentMix-3L.\n","authors":["Md Nishat Raihan","Dhiman Goswami","Antara Mahmud","Antonios Anastasopoulos","Marcos Zampieri"],"pdf_url":"https://arxiv.org/pdf/2310.18023v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17502v1","updated":"2023-11-29T10:24:50Z","published":"2023-11-29T10:24:50Z","title":"Enhancing Answer Selection in Community Question Answering with\n Pre-trained and Large Language Models","summary":" Community Question Answering (CQA) has become increasingly prevalent in recent\nyears. However, questions often receive a large number of answers, making it\ndifficult for users to select the relevant ones. Therefore, answer selection is a very\nsignificant subtask of CQA. In this paper, we first propose the Question-Answer\ncross attention networks (QAN) with pre-trained models for answer selection and\nutilize a large language model (LLM) to perform answer selection with knowledge\naugmentation. Specifically, we apply the BERT model as the encoder layer to do\npre-training for question subjects, question bodies and answers, respectively,\nthen the cross attention mechanism selects the most relevant answer for\ndifferent questions. Experiments show that the QAN model achieves\nstate-of-the-art performance on two datasets, SemEval2015 and SemEval2017.\nMoreover, we use the LLM to generate external knowledge from questions and\ncorrect answers to achieve knowledge augmentation for the answer selection task\nby LLM, while optimizing the prompt of LLM in different aspects. The results\nshow that the introduction of external knowledge can improve the correct answer\nselection rate of LLM on datasets SemEval2015 and SemEval2017. Meanwhile, LLM\ncan also select the correct answer on more questions with an optimized prompt.\n","authors":["Xinghang Hu"],"pdf_url":"https://arxiv.org/pdf/2311.17502v1.pdf","comment":"24 pages, 4 figures, 14 tables"},{"id":"http://arxiv.org/abs/2311.17492v1","updated":"2023-11-29T10:01:48Z","published":"2023-11-29T10:01:48Z","title":"Mergen: The First Manchu-Korean Machine Translation Model Trained on\n Augmented Data","summary":" The Manchu language, with its roots in the historical Manchurian region of\nNortheast China, is now facing a critical threat of extinction, as there are\nvery few speakers left. In our efforts to safeguard the Manchu language, we\nintroduce Mergen, the first-ever attempt at a Manchu-Korean Machine Translation\n(MT) model. To develop this model, we utilize valuable resources such as the\nManwen Laodang (a historical book) and a Manchu-Korean dictionary. Due to the\nscarcity of a Manchu-Korean parallel dataset, we expand our data by employing\nword replacement guided by GloVe embeddings, trained on both monolingual and\nparallel texts. Our approach is built around an encoder-decoder neural machine\ntranslation model, incorporating a bi-directional Gated Recurrent Unit (GRU)\nlayer. 
The experiments have yielded promising results, showcasing a significant\nenhancement in Manchu-Korean translation, with a remarkable 20-30 point\nincrease in the BLEU score.\n","authors":["Jean Seo","Sungjoo Byun","Minha Kang","Sangah Lee"],"pdf_url":"https://arxiv.org/pdf/2311.17492v1.pdf","comment":"emnlp2023/mrl2023"},{"id":"http://arxiv.org/abs/2311.17487v1","updated":"2023-11-29T09:48:34Z","published":"2023-11-29T09:48:34Z","title":"Taiwan LLM: Bridging the Linguistic Divide with a Culturally Aligned\n Language Model","summary":" In the realm of language models, the nuanced linguistic and cultural\nintricacies of Traditional Chinese, as spoken in Taiwan, have been largely\noverlooked. This paper introduces Taiwan LLM, a pioneering Large Language Model\nthat specifically caters to the Traditional Chinese language, with a focus on\nthe variant used in Taiwan. Leveraging a comprehensive pretraining corpus and\ninstruction-finetuning datasets, we have developed a model that not only\nunderstands the complexities of Traditional Chinese but also embodies the\ncultural context of Taiwan. Taiwan LLM represents the first of its kind, a\nmodel that is not only linguistically accurate but also culturally resonant\nwith its user base. Our evaluations demonstrate that Taiwan LLM achieves\nsuperior performance in understanding and generating Traditional Chinese text,\noutperforming existing models that are predominantly trained on Simplified\nChinese or English. The open-source release of Taiwan LLM invites collaboration\nand further innovation, ensuring that the linguistic diversity of Chinese\nspeakers is embraced and well-served. The model, datasets, and further\nresources are made publicly available to foster ongoing research and\ndevelopment in this field.\n","authors":["Yen-Ting Lin","Yun-Nung Chen"],"pdf_url":"https://arxiv.org/pdf/2311.17487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.10240v2","updated":"2023-11-29T08:30:34Z","published":"2022-12-20T13:36:25Z","title":"Diffusion Glancing Transformer for Parallel Sequence to Sequence\n Learning","summary":" Previously, non-autoregressive models were widely perceived as being superior\nin generation efficiency but inferior in generation quality due to the\ndifficulties of modeling multiple target modalities. To enhance the\nmulti-modality modeling ability, we propose the diffusion glancing transformer,\nwhich employs a modality diffusion process and residual glancing sampling. The\nmodality diffusion process is a discrete process that interpolates the\nmulti-modal distribution along the decoding steps, and the residual glancing\nsampling approach guides the model to continuously learn the remaining\nmodalities across the layers. Experimental results on various machine\ntranslation and text generation benchmarks demonstrate that DIFFGLAT achieves\nbetter generation accuracy while maintaining fast decoding speed compared with\nboth autoregressive and non-autoregressive models.\n","authors":["Lihua Qian","Mingxuan Wang","Yang Liu","Hao Zhou"],"pdf_url":"https://arxiv.org/pdf/2212.10240v2.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.17438v1","updated":"2023-11-29T08:29:54Z","published":"2023-11-29T08:29:54Z","title":"CLOMO: Counterfactual Logical Modification with Large Language Models","summary":" In this study, we delve into the realm of counterfactual reasoning\ncapabilities of large language models (LLMs). 
Our primary objective is to\ncultivate the counterfactual thought processes within LLMs and rigorously\nassess these processes for their validity. Specifically, we introduce a novel\ntask, Counterfactual Logical Modification (CLOMO), and a high-quality\nhuman-annotated benchmark. In this task, LLMs must adeptly alter a given\nargumentative text to uphold a predetermined logical relationship. To\neffectively evaluate a generation model's counterfactual capabilities, we\npropose an innovative evaluation metric, the LogicAware Counterfactual Score to\ndirectly evaluate the natural language output of LLMs instead of modeling the\ntask as a multiple-choice problem. Analysis shows that the proposed automatic\nmetric aligns well with human preference. Our experimental results show that\nwhile LLMs demonstrate a notable capacity for logical counterfactual thinking,\nthere remains a discernible gap between their current abilities and human\nperformance.\n","authors":["Yinya Huang","Ruixin Hong","Hongming Zhang","Wei Shao","Zhicheng Yang","Dong Yu","Changshui Zhang","Xiaodan Liang","Linqi Song"],"pdf_url":"https://arxiv.org/pdf/2311.17438v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05189v4","updated":"2023-11-29T08:18:14Z","published":"2023-05-09T05:48:38Z","title":"SUR-adapter: Enhancing Text-to-Image Pre-trained Diffusion Models with\n Large Language Models","summary":" Diffusion models, which have emerged to become popular text-to-image\ngeneration models, can produce high-quality and content-rich images guided by\ntextual prompts. However, there are limitations to semantic understanding and\ncommonsense reasoning in existing models when the input prompts are concise\nnarrative, resulting in low-quality image generation. To improve the capacities\nfor narrative prompts, we propose a simple-yet-effective parameter-efficient\nfine-tuning approach called the Semantic Understanding and Reasoning adapter\n(SUR-adapter) for pre-trained diffusion models. To reach this goal, we first\ncollect and annotate a new dataset SURD which consists of more than 57,000\nsemantically corrected multi-modal samples. Each sample contains a simple\nnarrative prompt, a complex keyword-based prompt, and a high-quality image.\nThen, we align the semantic representation of narrative prompts to the complex\nprompts and transfer knowledge of large language models (LLMs) to our\nSUR-adapter via knowledge distillation so that it can acquire the powerful\nsemantic understanding and reasoning capabilities to build a high-quality\ntextual semantic representation for text-to-image generation. We conduct\nexperiments by integrating multiple LLMs and popular pre-trained diffusion\nmodels to show the effectiveness of our approach in enabling diffusion models\nto understand and reason concise natural language without image quality\ndegradation. Our approach can make text-to-image diffusion models easier to use\nwith better user experience, which demonstrates our approach has the potential\nfor further advancing the development of user-friendly text-to-image generation\nmodels by bridging the semantic gap between simple narrative prompts and\ncomplex keyword-based prompts. 
The code is released at\nhttps://github.com/Qrange-group/SUR-adapter.\n","authors":["Shanshan Zhong","Zhongzhan Huang","Wushao Wen","Jinghui Qin","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2305.05189v4.pdf","comment":"accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2311.17429v1","updated":"2023-11-29T08:12:09Z","published":"2023-11-29T08:12:09Z","title":"TARGET: Template-Transferable Backdoor Attack Against Prompt-based NLP\n Models via GPT4","summary":" Prompt-based learning has been widely applied in many low-resource NLP tasks\nsuch as few-shot scenarios. However, this paradigm has been shown to be\nvulnerable to backdoor attacks. Most of the existing attack methods focus on\ninserting manually predefined templates as triggers in the pre-training phase\nto train the victim model and utilize the same triggers in the downstream task\nto perform inference, which tends to ignore the transferability and\nstealthiness of the templates. In this work, we propose a novel approach of\nTARGET (Template-trAnsfeRable backdoor attack aGainst prompt-basEd NLP models\nvia GPT4), which is a data-independent attack method. Specifically, we first\nutilize GPT4 to reformulate manual templates to generate tone-strong and normal\ntemplates, and the former are injected into the model as a backdoor trigger in\nthe pre-training phase. Then, we not only directly employ the above templates\nin the downstream task, but also use GPT4 to generate templates with similar\ntone to the above templates to carry out transferable attacks. Finally we have\nconducted extensive experiments on five NLP datasets and three BERT series\nmodels, with experimental results justifying that our TARGET method has better\nattack performance and stealthiness compared to the two-external baseline\nmethods on direct attacks, and in addition achieves satisfactory attack\ncapability in the unseen tone-similar templates.\n","authors":["Zihao Tan","Qingliang Chen","Yongjian Huang","Chen Liang"],"pdf_url":"https://arxiv.org/pdf/2311.17429v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2010.01502v2","updated":"2023-11-29T07:54:21Z","published":"2020-10-04T08:00:19Z","title":"Multi-turn Response Selection using Dialogue Dependency Relations","summary":" Multi-turn response selection is a task designed for developing dialogue\nagents. The performance on this task has a remarkable improvement with\npre-trained language models. However, these models simply concatenate the turns\nin dialogue history as the input and largely ignore the dependencies between\nthe turns. In this paper, we propose a dialogue extraction algorithm to\ntransform a dialogue history into threads based on their dependency relations.\nEach thread can be regarded as a self-contained sub-dialogue. We also propose\nThread-Encoder model to encode threads and candidates into compact\nrepresentations by pre-trained Transformers and finally get the matching score\nthrough an attention layer. The experiments show that dependency relations are\nhelpful for dialogue context understanding, and our model outperforms the\nstate-of-the-art baselines on both DSTC7 and DSTC8*, with competitive results\non UbuntuV2.\n","authors":["Qi Jia","Yizhu Liu","Siyu Ren","Kenny Q. 
Zhu","Haifeng Tang"],"pdf_url":"https://arxiv.org/pdf/2010.01502v2.pdf","comment":"Accepted for publication as a long paper in EMNLP2020"},{"id":"http://arxiv.org/abs/2309.14327v3","updated":"2023-11-29T07:52:18Z","published":"2023-09-25T17:53:29Z","title":"DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via\n Multi-Modal Causal Attention","summary":" Most of the existing multi-modal models, hindered by their incapacity to\nadeptly manage interleaved image-and-text inputs in multi-image, multi-round\ndialogues, face substantial constraints in resource allocation for training and\ndata accessibility, impacting their adaptability and scalability across varied\ninteraction realms. To address this, we present the DeepSpeed-VisualChat\nframework, designed to optimize Large Language Models (LLMs) by incorporating\nmulti-modal capabilities, with a focus on enhancing the proficiency of Large\nVision and Language Models in handling interleaved inputs. Our framework is\nnotable for (1) its open-source support for multi-round and multi-image\ndialogues, (2) introducing an innovative multi-modal causal attention\nmechanism, and (3) utilizing data blending techniques on existing datasets to\nassure seamless interactions in multi-round, multi-image conversations.\nCompared to existing frameworks, DeepSpeed-VisualChat shows superior\nscalability up to 70B parameter language model size, representing a significant\nadvancement in multi-modal language models and setting a solid foundation for\nfuture explorations.\n","authors":["Zhewei Yao","Xiaoxia Wu","Conglong Li","Minjia Zhang","Heyang Qin","Olatunji Ruwase","Ammar Ahmad Awan","Samyam Rajbhandari","Yuxiong He"],"pdf_url":"https://arxiv.org/pdf/2309.14327v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17404v1","updated":"2023-11-29T07:15:34Z","published":"2023-11-29T07:15:34Z","title":"VITATECS: A Diagnostic Dataset for Temporal Concept Understanding of\n Video-Language Models","summary":" The ability to perceive how objects change over time is a crucial ingredient\nin human intelligence. However, current benchmarks cannot faithfully reflect\nthe temporal understanding abilities of video-language models (VidLMs) due to\nthe existence of static visual shortcuts. To remedy this issue, we present\nVITATECS, a diagnostic VIdeo-Text dAtaset for the evaluation of TEmporal\nConcept underStanding. Specifically, we first introduce a fine-grained taxonomy\nof temporal concepts in natural language in order to diagnose the capability of\nVidLMs to comprehend different temporal aspects. Furthermore, to disentangle\nthe correlation between static and temporal information, we generate\ncounterfactual video descriptions that differ from the original one only in the\nspecified temporal aspect. We employ a semi-automatic data collection framework\nusing large language models and human-in-the-loop annotation to obtain\nhigh-quality counterfactual descriptions efficiently. 
Evaluation of\nrepresentative video-language understanding models confirms their deficiency in\ntemporal understanding, revealing the need for greater emphasis on the temporal\nelements in video-language research.\n","authors":["Shicheng Li","Lei Li","Shuhuai Ren","Yuanxin Liu","Yi Liu","Rundong Gao","Xu Sun","Lu Hou"],"pdf_url":"https://arxiv.org/pdf/2311.17404v1.pdf","comment":"23 pages, 6 figures, 18 tables, data is available at\n https://github.com/lscpku/VITATECS"},{"id":"http://arxiv.org/abs/2311.17400v1","updated":"2023-11-29T07:09:13Z","published":"2023-11-29T07:09:13Z","title":"Improving the Robustness of Transformer-based Large Language Models with\n Dynamic Attention","summary":" Transformer-based models, such as BERT and GPT, have been widely adopted in\nnatural language processing (NLP) due to their exceptional performance.\nHowever, recent studies show their vulnerability to textual adversarial attacks\nwhere the model's output can be misled by intentionally manipulating the text\ninputs. Although various methods have been proposed to enhance the model's\nrobustness and mitigate this vulnerability, many require heavy resource\nconsumption (e.g., adversarial training) or only provide limited protection\n(e.g., defensive dropout). In this paper, we propose a novel method called\ndynamic attention, tailored for the transformer architecture, to enhance the\ninherent robustness of the model itself against various adversarial attacks.\nOur method requires no downstream task knowledge and does not incur additional\ncosts. The proposed dynamic attention consists of two modules: (i) attention\nrectification, which masks or weakens the attention value of the chosen tokens,\nand (ii) dynamic modeling, which dynamically builds the set of candidate\ntokens. Extensive experiments demonstrate that dynamic attention significantly\nmitigates the impact of adversarial attacks, achieving up to 33\\% better\nperformance than previous methods against widely-used adversarial attacks. The\nmodel-level design of dynamic attention enables it to be easily combined with\nother defense methods (e.g., adversarial training) to further enhance the\nmodel's robustness. Furthermore, we demonstrate that dynamic attention\npreserves the state-of-the-art robustness space of the original model compared\nto other dynamic modeling methods.\n","authors":["Lujia Shen","Yuwen Pu","Shouling Ji","Changjiang Li","Xuhong Zhang","Chunpeng Ge","Ting Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17400v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17391v1","updated":"2023-11-29T06:42:36Z","published":"2023-11-29T06:42:36Z","title":"Unveiling the Implicit Toxicity in Large Language Models","summary":" The open-endedness of large language models (LLMs) combined with their\nimpressive capabilities may lead to new safety issues when being exploited for\nmalicious use. While recent studies primarily focus on probing toxic outputs\nthat can be easily detected with existing toxicity classifiers, we show that\nLLMs can generate diverse implicit toxic outputs that are exceptionally\ndifficult to detect via simple zero-shot prompting. Moreover, we propose a\nreinforcement learning (RL) based attacking method to further induce the\nimplicit toxicity in LLMs. Specifically, we optimize the language model with a\nreward that prefers implicit toxic outputs to explicit toxic and non-toxic\nones. 
Experiments on five widely-adopted toxicity classifiers demonstrate that\nthe attack success rate can be significantly improved through RL fine-tuning.\nFor instance, the RL-finetuned LLaMA-13B model achieves an attack success rate\nof 90.04% on BAD and 62.85% on Davinci003. Our findings suggest that LLMs pose\na significant threat in generating undetectable implicit toxic outputs. We\nfurther show that fine-tuning toxicity classifiers on the annotated examples\nfrom our attacking method can effectively enhance their ability to detect\nLLM-generated implicit toxic language. The code is publicly available at\nhttps://github.com/thu-coai/Implicit-Toxicity.\n","authors":["Jiaxin Wen","Pei Ke","Hao Sun","Zhexin Zhang","Chengfei Li","Jinfeng Bai","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2311.17391v1.pdf","comment":"EMNLP 2023 Main Conference"},{"id":"http://arxiv.org/abs/2311.17376v1","updated":"2023-11-29T06:02:16Z","published":"2023-11-29T06:02:16Z","title":"CESAR: Automatic Induction of Compositional Instructions for Multi-turn\n Dialogs","summary":" Instruction-based multitasking has played a critical role in the success of\nlarge language models (LLMs) in multi-turn dialog applications. While publicly\navailable LLMs have shown promising performance, when exposed to complex\ninstructions with multiple constraints, they lag against state-of-the-art\nmodels like ChatGPT. In this work, we hypothesize that the availability of\nlarge-scale complex demonstrations is crucial in bridging this gap. Focusing on\ndialog applications, we propose a novel framework, CESAR, that unifies a large\nnumber of dialog tasks in the same format and allows programmatic induction of\ncomplex instructions without any manual effort.\n We apply CESAR on InstructDial, a benchmark for instruction-based dialog\ntasks. We further enhance InstructDial with new datasets and tasks and utilize\nCESAR to induce complex tasks with compositional instructions. This results in\na new benchmark called InstructDial++, which includes 63 datasets with 86 basic\ntasks and 68 composite tasks. Through rigorous experiments, we demonstrate the\nscalability of CESAR in providing rich instructions. Models trained on\nInstructDial++ can follow compositional prompts, such as prompts that ask for\nmultiple stylistic constraints.\n","authors":["Taha Aksu","Devamanyu Hazarika","Shikib Mehri","Seokhwan Kim","Dilek Hakkani-Tür","Yang Liu","Mahdi Namazifar"],"pdf_url":"https://arxiv.org/pdf/2311.17376v1.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.16444v2","updated":"2023-11-29T06:01:34Z","published":"2023-11-28T02:51:13Z","title":"Exo2EgoDVC: Dense Video Captioning of Egocentric Procedural Activities\n Using Web Instructional Videos","summary":" We propose a novel benchmark for cross-view knowledge transfer of dense video\ncaptioning, adapting models from web instructional videos with exocentric views\nto an egocentric view. While dense video captioning (predicting time segments\nand their captions) is primarily studied with exocentric videos (e.g.,\nYouCook2), benchmarks with egocentric videos are restricted due to data\nscarcity. To overcome the limited video availability, transferring knowledge\nfrom abundant exocentric web videos is demanded as a practical approach.\nHowever, learning the correspondence between exocentric and egocentric views is\ndifficult due to their dynamic view changes. 
The web videos contain mixed views\nfocusing on either human body actions or close-up hand-object interactions,\nwhile the egocentric view is constantly shifting as the camera wearer moves.\nThis necessitates the in-depth study of cross-view transfer under complex view\nchanges. In this work, we first create a real-life egocentric dataset (EgoYC2)\nwhose captions are shared with YouCook2, enabling transfer learning between\nthese datasets assuming their ground-truth is accessible. To bridge the view\ngaps, we propose a view-invariant learning method using adversarial training in\nboth the pre-training and fine-tuning stages. While the pre-training is\ndesigned to learn invariant features against the mixed views in the web videos,\nthe view-invariant fine-tuning further mitigates the view gaps between both\ndatasets. We validate our proposed method by studying how effectively it\novercomes the view change problem and efficiently transfers the knowledge to\nthe egocentric domain. Our benchmark pushes the study of the cross-view\ntransfer into a new task domain of dense video captioning and will envision\nmethodologies to describe egocentric videos in natural language.\n","authors":["Takehiko Ohkawa","Takuma Yagi","Taichi Nishimura","Ryosuke Furuta","Atsushi Hashimoto","Yoshitaka Ushiku","Yoichi Sato"],"pdf_url":"https://arxiv.org/pdf/2311.16444v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17371v1","updated":"2023-11-29T05:54:41Z","published":"2023-11-29T05:54:41Z","title":"Are we going MAD? Benchmarking Multi-Agent Debate between Language\n Models for Medical Q&A","summary":" Recent advancements in large language models (LLMs) underscore their\npotential for responding to medical inquiries. However, ensuring that\ngenerative agents provide accurate and reliable answers remains an ongoing\nchallenge. In this context, multi-agent debate (MAD) has emerged as a prominent\nstrategy for enhancing the truthfulness of LLMs. In this work, we provide a\ncomprehensive benchmark of MAD strategies for medical Q&A, along with\nopen-source implementations. This explores the effective utilization of various\nstrategies including the trade-offs between cost, time, and accuracy. We build\nupon these insights to provide a novel debate-prompting strategy based on agent\nagreement that outperforms previously published strategies on medical Q&A\ntasks.\n","authors":["Andries Smit","Paul Duckworth","Nathan Grinsztajn","Kale-ab Tessera","Thomas D. Barrett","Arnu Pretorius"],"pdf_url":"https://arxiv.org/pdf/2311.17371v1.pdf","comment":"16 pages, 6 figures, NeurIPS DGM4H Workshop 2023"},{"id":"http://arxiv.org/abs/2310.18348v3","updated":"2023-11-29T05:32:24Z","published":"2023-10-23T04:35:58Z","title":"Meaning Representations from Trajectories in Autoregressive Models","summary":" We propose to extract meaning representations from autoregressive language\nmodels by considering the distribution of all possible trajectories extending\nan input text. This strategy is prompt-free, does not require fine-tuning, and\nis applicable to any pre-trained autoregressive model. Moreover, unlike\nvector-based representations, distribution-based representations can also model\nasymmetric relations (e.g., direction of logical entailment, hypernym/hyponym\nrelations) by using algebraic operations between likelihood functions. 
These\nideas are grounded in distributional perspectives on semantics and are\nconnected to standard constructions in automata theory, but to our knowledge\nthey have not been applied to modern language models. We empirically show that\nthe representations obtained from large models align well with human\nannotations, outperform other zero-shot and prompt-free methods on semantic\nsimilarity tasks, and can be used to solve more complex entailment and\ncontainment tasks that standard embeddings cannot handle. Finally, we extend\nour method to represent data from different modalities (e.g., image and text)\nusing multimodal autoregressive models. Our code is available at:\nhttps://github.com/tianyu139/meaning-as-trajectories\n","authors":["Tian Yu Liu","Matthew Trager","Alessandro Achille","Pramuditha Perera","Luca Zancato","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2310.18348v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17355v1","updated":"2023-11-29T05:04:52Z","published":"2023-11-29T05:04:52Z","title":"Are Large Language Models Good Fact Checkers: A Preliminary Study","summary":" Recently, Large Language Models (LLMs) have drawn significant attention due\nto their outstanding reasoning capabilities and extensive knowledge repository,\npositioning them as superior in handling various natural language processing\ntasks compared to other language models. In this paper, we present a\npreliminary investigation into the potential of LLMs in fact-checking. This\nstudy aims to comprehensively evaluate various LLMs in tackling specific\nfact-checking subtasks, systematically evaluating their capabilities, and\nconducting a comparative analysis of their performance against pre-trained and\nstate-of-the-art low-parameter models. Experiments demonstrate that LLMs\nachieve competitive performance compared to other small models in most\nscenarios. However, they encounter challenges in effectively handling Chinese\nfact verification and the entirety of the fact-checking pipeline due to\nlanguage inconsistencies and hallucinations. These findings underscore the need\nfor further exploration and research to enhance the proficiency of LLMs as\nreliable fact-checkers, unveiling the potential capability of LLMs and the\npossible challenges in fact-checking tasks.\n","authors":["Han Cao","Lingwei Wei","Mengyang Chen","Wei Zhou","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2311.17355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17352v1","updated":"2023-11-29T04:31:35Z","published":"2023-11-29T04:31:35Z","title":"Efficient Stitchable Task Adaptation","summary":" The paradigm of pre-training and fine-tuning has laid the foundation for\ndeploying deep learning models. However, most fine-tuning methods are designed\nto meet a specific resource budget. Recently, considering diverse deployment\nscenarios with various resource budgets, stitchable neural network (SN-Net) is\nintroduced to quickly obtain numerous new networks (stitches) from the\npre-trained models (anchors) in a model family via model stitching. Although\npromising, SN-Net confronts new challenges when adapting it to new target\ndomains, including huge memory and storage requirements and a long and\nsub-optimal multistage adaptation process. 
In this work, we present a novel\nframework, Efficient Stitchable Task Adaptation (ESTA), to efficiently produce\na palette of fine-tuned models that adhere to diverse resource constraints.\nSpecifically, we first tailor parameter-efficient fine-tuning to share low-rank\nupdates among the stitches while maintaining independent bias terms. In this\nway, we largely reduce fine-tuning memory burdens and mitigate the interference\namong stitches that arises in task adaptation. Furthermore, we streamline a\nsimple yet effective one-stage deployment pipeline, which estimates the\nimportant stitches to deploy with training-time gradient statistics. By\nassigning higher sampling probabilities to important stitches, we also get a\nboosted Pareto frontier. Extensive experiments on 25 downstream visual\nrecognition tasks demonstrate that our ESTA is capable of generating stitches\nwith smooth accuracy-efficiency trade-offs and surpasses the direct SN-Net\nadaptation by remarkable margins with significantly lower training time and\nfewer trainable parameters. Furthermore, we demonstrate the flexibility and\nscalability of our ESTA framework by stitching LLMs from LLaMA family,\nobtaining chatbot stitches of assorted sizes.\n","authors":["Haoyu He","Zizheng Pan","Jing Liu","Jianfei Cai","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2311.17352v1.pdf","comment":"Source code will be released at\n https://github.com/ziplab/Stitched_LLaMA"},{"id":"http://arxiv.org/abs/2311.17351v1","updated":"2023-11-29T04:25:15Z","published":"2023-11-29T04:25:15Z","title":"Exploring Large Language Models for Human Mobility Prediction under\n Public Events","summary":" Public events, such as concerts and sports games, can be major attractors for\nlarge crowds, leading to irregular surges in travel demand. Accurate human\nmobility prediction for public events is thus crucial for event planning as\nwell as traffic or crowd management. While rich textual descriptions about\npublic events are commonly available from online sources, it is challenging to\nencode such information in statistical or machine learning models. Existing\nmethods are generally limited in incorporating textual information, handling\ndata sparsity, or providing rationales for their predictions. To address these\nchallenges, we introduce a framework for human mobility prediction under public\nevents (LLM-MPE) based on Large Language Models (LLMs), leveraging their\nunprecedented ability to process textual data, learn from minimal examples, and\ngenerate human-readable explanations. Specifically, LLM-MPE first transforms\nraw, unstructured event descriptions from online sources into a standardized\nformat, and then segments historical mobility data into regular and\nevent-related components. A prompting strategy is designed to direct LLMs in\nmaking and rationalizing demand predictions considering historical mobility and\nevent features. A case study is conducted for Barclays Center in New York City,\nbased on publicly available event information and taxi trip data. Results show\nthat LLM-MPE surpasses traditional models, particularly on event days, with\ntextual data significantly enhancing its accuracy. Furthermore, LLM-MPE offers\ninterpretable insights into its predictions. 
Despite the great potential of\nLLMs, we also identify key challenges including misinformation and high costs\nthat remain barriers to their broader adoption in large-scale human mobility\nanalysis.\n","authors":["Yuebing Liang","Yichao Liu","Xiaohan Wang","Zhan Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.17351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17330v1","updated":"2023-11-29T03:07:00Z","published":"2023-11-29T03:07:00Z","title":"Biomedical knowledge graph-enhanced prompt generation for large language\n models","summary":" Large Language Models (LLMs) have been driving progress in AI at an\nunprecedented rate, yet still face challenges in knowledge-intensive domains\nlike biomedicine. Solutions such as pre-training and domain-specific\nfine-tuning add substantial computational overhead, and the latter require\ndomain-expertise. External knowledge infusion is task-specific and requires\nmodel training. Here, we introduce a task-agnostic Knowledge Graph-based\nRetrieval Augmented Generation (KG-RAG) framework by leveraging the massive\nbiomedical KG SPOKE with LLMs such as Llama-2-13b, GPT-3.5-Turbo and GPT-4, to\ngenerate meaningful biomedical text rooted in established knowledge. KG-RAG\nconsistently enhanced the performance of LLMs across various prompt types,\nincluding one-hop and two-hop prompts, drug repurposing queries, biomedical\ntrue/false questions, and multiple-choice questions (MCQ). Notably, KG-RAG\nprovides a remarkable 71% boost in the performance of the Llama-2 model on the\nchallenging MCQ dataset, demonstrating the framework's capacity to empower\nopen-source models with fewer parameters for domain-specific questions.\nFurthermore, KG-RAG enhanced the performance of proprietary GPT models, such as\nGPT-3.5 which exhibited improvement over GPT-4 in context utilization on MCQ\ndata. Our approach was also able to address drug repurposing questions,\nreturning meaningful repurposing suggestions. In summary, the proposed\nframework combines explicit and implicit knowledge of KG and LLM, respectively,\nin an optimized fashion, thus enhancing the adaptability of general-purpose\nLLMs to tackle domain-specific questions in a unified framework.\n","authors":["Karthik Soman","Peter W Rose","John H Morris","Rabia E Akbas","Brett Smith","Braian Peetoom","Catalina Villouta-Reyes","Gabriel Cerono","Yongmei Shi","Angela Rizk-Jackson","Sharat Israni","Charlotte A Nelson","Sui Huang","Sergio E Baranzini"],"pdf_url":"https://arxiv.org/pdf/2311.17330v1.pdf","comment":"28 pages, 5 figures, 2 tables, 1 supplementary file"},{"id":"http://arxiv.org/abs/2311.17311v1","updated":"2023-11-29T02:07:09Z","published":"2023-11-29T02:07:09Z","title":"Universal Self-Consistency for Large Language Model Generation","summary":" Self-consistency with chain-of-thought prompting (CoT) has demonstrated\nremarkable performance gains on various challenging tasks, by utilizing\nmultiple reasoning paths sampled from large language models (LLMs). However,\nself-consistency relies on the answer extraction process to aggregate multiple\nsolutions, which is not applicable to free-form answers. In this work, we\npropose Universal Self-Consistency (USC), which leverages LLMs themselves to\nselect the most consistent answer among multiple candidates. We evaluate USC on\na variety of benchmarks, including mathematical reasoning, code generation,\nlong-context summarization, and open-ended question answering. 
On open-ended\ngeneration tasks where the original self-consistency method is not applicable,\nUSC effectively utilizes multiple samples and improves the performance. For\nmathematical reasoning, USC matches the standard self-consistency performance\nwithout requiring the answer formats to be similar. Finally, without access to\nexecution results, USC also matches the execution-based voting performance on\ncode generation.\n","authors":["Xinyun Chen","Renat Aksitov","Uri Alon","Jie Ren","Kefan Xiao","Pengcheng Yin","Sushant Prakash","Charles Sutton","Xuezhi Wang","Denny Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.17311v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17307v1","updated":"2023-11-29T01:59:38Z","published":"2023-11-29T01:59:38Z","title":"RoKEPG: RoBERTa and Knowledge Enhancement for Prescription Generation of\n Traditional Chinese Medicine","summary":" Traditional Chinese medicine (TCM) prescription is the most critical form of\nTCM treatment, and uncovering the complex nonlinear relationship between\nsymptoms and TCM is of great significance for clinical practice and assisting\nphysicians in diagnosis and treatment. Although there have been some studies on\nTCM prescription generation, these studies consider a single factor and\ndirectly model the symptom-prescription generation problem mainly based on\nsymptom descriptions, lacking guidance from TCM knowledge. To this end, we\npropose a RoBERTa and Knowledge Enhancement model for Prescription Generation\nof Traditional Chinese Medicine (RoKEPG). RoKEPG is firstly pre-trained by our\nconstructed TCM corpus, followed by fine-tuning the pre-trained model, and the\nmodel is guided to generate TCM prescriptions by introducing four classes of\nknowledge of TCM through the attention mask matrix. Experimental results on the\npublicly available TCM prescription dataset show that RoKEPG improves the F1\nmetric by about 2% over the baseline model with the best results.\n","authors":["Hua Pu","Jiacong Mi","Shan Lu","Jieyue He"],"pdf_url":"https://arxiv.org/pdf/2311.17307v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2311.16203v2","updated":"2023-11-29T01:53:46Z","published":"2023-11-27T08:52:10Z","title":"ChatTraffic: Text-to-Traffic Generation via Diffusion Model","summary":" Traffic prediction is one of the most significant foundations in Intelligent\nTransportation Systems (ITS). Traditional traffic prediction methods rely only\non historical traffic data to predict traffic trends and face two main\nchallenges. 1) insensitivity to unusual events. 2) poor performance in\nlong-term prediction. In this work, we explore how generative models combined\nwith text describing the traffic system can be applied for traffic generation\nand name the task Text-to-Traffic Generation (TTG). The key challenge of the\nTTG task is how to associate text with the spatial structure of the road\nnetwork and traffic data for generating traffic situations. To this end, we\npropose ChatTraffic, the first diffusion model for text-to-traffic generation.\nTo guarantee the consistency between synthetic and real data, we augment a\ndiffusion model with the Graph Convolutional Network (GCN) to extract spatial\ncorrelations of traffic data. In addition, we construct a large dataset\ncontaining text-traffic pairs for the TTG task. We benchmarked our model\nqualitatively and quantitatively on the released dataset. The experimental\nresults indicate that ChatTraffic can generate realistic traffic situations\nfrom the text. 
Our code and dataset are available at\nhttps://github.com/ChyaZhang/ChatTraffic.\n","authors":["Chengyang Zhang","Yong Zhang","Qitan Shao","Bo Li","Yisheng Lv","Xinglin Piao","Baocai Yin"],"pdf_url":"https://arxiv.org/pdf/2311.16203v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17301v1","updated":"2023-11-29T01:19:02Z","published":"2023-11-29T01:19:02Z","title":"Language Models: A Guide for the Perplexed","summary":" Given the growing importance of AI literacy, we decided to write this\ntutorial to help narrow the gap between the discourse among those who study\nlanguage models -- the core technology underlying ChatGPT and similar products\n-- and those who are intrigued and want to learn more about them. In short, we\nbelieve the perspective of researchers and educators can add some clarity to\nthe public's understanding of the technologies beyond what's currently\navailable, which tends to be either extremely technical or promotional material\ngenerated about products by their purveyors.\n Our approach teases apart the concept of a language model from products built\non them, from the behaviors attributed to or desired from those products, and\nfrom claims about similarity to human cognition. As a starting point, we (1)\noffer a scientific viewpoint that focuses on questions amenable to study\nthrough experimentation; (2) situate language models as they are today in the\ncontext of the research that led to their development; and (3) describe the\nboundaries of what is known about the models at this writing.\n","authors":["Sofia Serrano","Zander Brumbaugh","Noah A. Smith"],"pdf_url":"https://arxiv.org/pdf/2311.17301v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17295v1","updated":"2023-11-29T00:45:23Z","published":"2023-11-29T00:45:23Z","title":"Elo Uncovered: Robustness and Best Practices in Language Model\n Evaluation","summary":" In Natural Language Processing (NLP), the Elo rating system, originally\ndesigned for ranking players in dynamic games such as chess, is increasingly\nbeing used to evaluate Large Language Models (LLMs) through \"A vs B\" paired\ncomparisons. However, while popular, the system's suitability for assessing\nentities with constant skill levels, such as LLMs, remains relatively\nunexplored. We study two fundamental axioms that evaluation methods should\nadhere to: reliability and transitivity. We conduct extensive evaluation of Elo\nbehaviour, illustrating that individual Elo computations exhibit volatility and\ndelving into the impact of varying the Elo rating system's hyperparameters. We\nshow that these axioms are not always satisfied raising questions about the\nreliability of current comparative evaluations of LLMs. If the current use of\nElo scores is intended to substitute the costly head-to-head comparison of\nLLMs, it is crucial to ensure the ranking is as robust as possible. Guided by\nthe axioms, our findings offer concrete guidelines for enhancing the\nreliability of LLM evaluation methods, suggesting a need for reassessment of\nexisting comparative approaches.\n","authors":["Meriem Boubdir","Edward Kim","Beyza Ermis","Sara Hooker","Marzieh Fadaee"],"pdf_url":"https://arxiv.org/pdf/2311.17295v1.pdf","comment":"22 pages, 7 figures, 2 tables. 
Revised version of the paper accepted\n at GEM Workshop, EMNLP 2023"},{"id":"http://arxiv.org/abs/2306.01286v2","updated":"2023-11-29T23:57:03Z","published":"2023-06-02T06:11:26Z","title":"KL-Divergence Guided Temperature Sampling","summary":" Temperature sampling is a conventional approach to diversify large language\nmodel predictions. As temperature increases, the prediction becomes diverse but\nalso vulnerable to hallucinations -- generating tokens that are sensible but\nnot factual. One common approach to mitigate hallucinations is to provide\nsource/grounding documents and the model is trained to produce predictions that\nbind to and are attributable to the provided source. It appears that there is a\ntrade-off between diversity and attribution. To mitigate any such trade-off, we\npropose to relax the constraint of having a fixed temperature over decoding\nsteps, and a mechanism to guide the dynamic temperature according to its\nrelevance to the source through KL-divergence. Our experiments justifies the\ntrade-off, and shows that our sampling algorithm outperforms the conventional\ntop-k and top-p algorithms in conversational question-answering and\nsummarization tasks.\n","authors":["Chung-Ching Chang","David Reitter","Renat Aksitov","Yun-Hsuan Sung"],"pdf_url":"https://arxiv.org/pdf/2306.01286v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18151v1","updated":"2023-11-29T23:45:57Z","published":"2023-11-29T23:45:57Z","title":"Uncertainty Guided Global Memory Improves Multi-Hop Question Answering","summary":" Transformers have become the gold standard for many natural language\nprocessing tasks and, in particular, for multi-hop question answering (MHQA).\nThis task includes processing a long document and reasoning over the multiple\nparts of it. The landscape of MHQA approaches can be classified into two\nprimary categories. The first group focuses on extracting supporting evidence,\nthereby constraining the QA model's context to predicted facts. Conversely, the\nsecond group relies on the attention mechanism of the long input encoding model\nto facilitate multi-hop reasoning. However, attention-based token\nrepresentations lack explicit global contextual information to connect\nreasoning steps. To address these issues, we propose GEMFormer, a two-stage\nmethod that first collects relevant information over the entire document to the\nmemory and then combines it with local context to solve the task. Our\nexperimental results show that fine-tuning a pre-trained model with\nmemory-augmented input, including the most certain global elements, improves\nthe model's performance on three MHQA datasets compared to the baseline. We\nalso found that the global explicit memory contains information from supporting\nfacts required for the correct answer.\n","authors":["Alsu Sagirova","Mikhail Burtsev"],"pdf_url":"https://arxiv.org/pdf/2311.18151v1.pdf","comment":"12 pages, 7 figures. EMNLP 2023. Our code is available at\n https://github.com/Aloriosa/GEMFormer"},{"id":"http://arxiv.org/abs/2305.04082v2","updated":"2023-11-29T23:25:19Z","published":"2023-05-06T16:05:27Z","title":"A Minimal Approach for Natural Language Action Space in Text-based Games","summary":" Text-based games (TGs) are language-based interactive environments for\nreinforcement learning. While language models (LMs) and knowledge graphs (KGs)\nare commonly used for handling large action space in TGs, it is unclear whether\nthese techniques are necessary or overused. 
In this paper, we revisit the\nchallenge of exploring the action space in TGs and propose $\n\\epsilon$-admissible exploration, a minimal approach of utilizing admissible\nactions, for training phase. Additionally, we present a text-based actor-critic\n(TAC) agent that produces textual commands for game, solely from game\nobservations, without requiring any KG or LM. Our method, on average across 10\ngames from Jericho, outperforms strong baselines and state-of-the-art agents\nthat use LM and KG. Our approach highlights that a much lighter model design,\nwith a fresh perspective on utilizing the information within the environments,\nsuffices for an effective exploration of exponentially large action spaces.\n","authors":["Dongwon Kelvin Ryu","Meng Fang","Shirui Pan","Gholamreza Haffari","Ehsan Shareghi"],"pdf_url":"https://arxiv.org/pdf/2305.04082v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18147v1","updated":"2023-11-29T23:20:17Z","published":"2023-11-29T23:20:17Z","title":"DisCGen: A Framework for Discourse-Informed Counterspeech Generation","summary":" Counterspeech can be an effective method for battling hateful content on\nsocial media. Automated counterspeech generation can aid in this process.\nGenerated counterspeech, however, can be viable only when grounded in the\ncontext of topic, audience and sensitivity as these factors influence both the\nefficacy and appropriateness. In this work, we propose a novel framework based\non theories of discourse to study the inferential links that connect counter\nspeeches to the hateful comment. Within this framework, we propose: i) a\ntaxonomy of counterspeech derived from discourse frameworks, and ii)\ndiscourse-informed prompting strategies for generating contextually-grounded\ncounterspeech. To construct and validate this framework, we present a process\nfor collecting an in-the-wild dataset of counterspeech from Reddit. Using this\nprocess, we manually annotate a dataset of 3.9k Reddit comment pairs for the\npresence of hatespeech and counterspeech. The positive pairs are annotated for\n10 classes in our proposed taxonomy. We annotate these pairs with paraphrased\ncounterparts to remove offensiveness and first-person references. We show that\nby using our dataset and framework, large language models can generate\ncontextually-grounded counterspeech informed by theories of discourse.\nAccording to our human evaluation, our approaches can act as a safeguard\nagainst critical failures of discourse-agnostic models.\n","authors":["Sabit Hassan","Malihe Alikhani"],"pdf_url":"https://arxiv.org/pdf/2311.18147v1.pdf","comment":"IJCNLP-AACL, 2023"},{"id":"http://arxiv.org/abs/2311.08592v2","updated":"2023-11-29T23:18:16Z","published":"2023-11-14T23:28:23Z","title":"AART: AI-Assisted Red-Teaming with Diverse Data Generation for New\n LLM-powered Applications","summary":" Adversarial testing of large language models (LLMs) is crucial for their safe\nand responsible deployment. We introduce a novel approach for automated\ngeneration of adversarial evaluation datasets to test the safety of LLM\ngenerations on new downstream applications. We call it AI-assisted Red-Teaming\n(AART) - an automated alternative to current manual red-teaming efforts. AART\noffers a data generation and augmentation pipeline of reusable and customizable\nrecipes that reduce human effort significantly and enable integration of\nadversarial testing earlier in new product development. 
AART generates\nevaluation datasets with high diversity of content characteristics critical for\neffective adversarial testing (e.g. sensitive and harmful concepts, specific to\na wide range of cultural and geographic regions and application scenarios). The\ndata generation is steered by AI-assisted recipes to define, scope and\nprioritize diversity within the application context. This feeds into a\nstructured LLM-generation process that scales up evaluation priorities.\nCompared to some state-of-the-art tools, AART shows promising results in terms\nof concept coverage and data quality.\n","authors":["Bhaktipriya Radharapu","Kevin Robinson","Lora Aroyo","Preethi Lahoti"],"pdf_url":"https://arxiv.org/pdf/2311.08592v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18140v1","updated":"2023-11-29T23:03:04Z","published":"2023-11-29T23:03:04Z","title":"ROBBIE: Robust Bias Evaluation of Large Generative Language Models","summary":" As generative large language models (LLMs) grow more performant and\nprevalent, we must develop comprehensive enough tools to measure and improve\ntheir fairness. Different prompt-based datasets can be used to measure social\nbias across multiple text domains and demographic axes, meaning that testing\nLLMs on more datasets can potentially help us characterize their biases more\nfully, and better ensure equal and equitable treatment of marginalized\ndemographic groups. In this work, our focus is two-fold:\n (1) Benchmarking: a comparison of 6 different prompt-based bias and toxicity\nmetrics across 12 demographic axes and 5 families of generative LLMs. Out of\nthose 6 metrics, AdvPromptSet and HolisticBiasR are novel datasets proposed in\nthe paper. The comparison of those benchmarks gives us insights about the bias\nand toxicity of the compared models. Therefore, we explore the frequency of\ndemographic terms in common LLM pre-training corpora and how this may relate to\nmodel biases.\n (2) Mitigation: we conduct a comprehensive study of how well 3 bias/toxicity\nmitigation techniques perform across our suite of measurements. ROBBIE aims to\nprovide insights for practitioners while deploying a model, emphasizing the\nneed to not only measure potential harms, but also understand how they arise by\ncharacterizing the data, mitigate harms once found, and balance any trade-offs.\nWe open-source our analysis code in hopes of encouraging broader measurements\nof bias in future LLMs.\n","authors":["David Esiobu","Xiaoqing Tan","Saghar Hosseini","Megan Ung","Yuchen Zhang","Jude Fernandes","Jane Dwivedi-Yu","Eleonora Presani","Adina Williams","Eric Michael Smith"],"pdf_url":"https://arxiv.org/pdf/2311.18140v1.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.04076v2","updated":"2023-11-29T22:00:12Z","published":"2023-11-07T15:40:43Z","title":"Do LLMs exhibit human-like response biases? A case study in survey\n design","summary":" As large language models (LLMs) become more capable, there is growing\nexcitement about the possibility of using LLMs as proxies for humans in\nreal-world tasks where subjective labels are desired, such as in surveys and\nopinion polling. One widely-cited barrier to the adoption of LLMs is their\nsensitivity to prompt wording - but interestingly, humans also display\nsensitivities to instruction changes in the form of response biases. As such,\nwe argue that if LLMs are going to be used to approximate human opinions, it is\nnecessary to investigate the extent to which LLMs also reflect human response\nbiases, if at all. 
In this work, we use survey design as a case study, where\nhuman response biases caused by permutations in wordings of \"prompts\" have been\nextensively studied. Drawing from prior work in social psychology, we design a\ndataset and propose a framework to evaluate whether LLMs exhibit human-like\nresponse biases in survey questionnaires. Our comprehensive evaluation of nine\nmodels shows that popular open and commercial LLMs generally fail to reflect\nhuman-like behavior. These inconsistencies tend to be more prominent in models\nthat have been instruction fine-tuned. Furthermore, even if a model shows a\nsignificant change in the same direction as humans, we find that perturbations\nthat are not meant to elicit significant changes in humans may also result in a\nsimilar change. These results highlight the potential pitfalls of using LLMs to\nsubstitute humans in parts of the annotation pipeline, and further underscore\nthe importance of finer-grained characterizations of model behavior. Our code,\ndataset, and collected samples are available at\nhttps://github.com/lindiatjuatja/BiasMonkey\n","authors":["Lindia Tjuatja","Valerie Chen","Sherry Tongshuang Wu","Ameet Talwalkar","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2311.04076v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10012v2","updated":"2023-11-29T21:59:11Z","published":"2023-06-16T17:58:58Z","title":"MagicBrush: A Manually Annotated Dataset for Instruction-Guided Image\n Editing","summary":" Text-guided image editing is widely needed in daily life, ranging from\npersonal use to professional applications such as Photoshop. However, existing\nmethods are either zero-shot or trained on an automatically synthesized\ndataset, which contains a high volume of noise. Thus, they still require lots\nof manual tuning to produce desirable outcomes in practice. To address this\nissue, we introduce MagicBrush (https://osu-nlp-group.github.io/MagicBrush/),\nthe first large-scale, manually annotated dataset for instruction-guided real\nimage editing that covers diverse scenarios: single-turn, multi-turn,\nmask-provided, and mask-free editing. MagicBrush comprises over 10K manually\nannotated triplets (source image, instruction, target image), which supports\ntrainining large-scale text-guided image editing models. We fine-tune\nInstructPix2Pix on MagicBrush and show that the new model can produce much\nbetter images according to human evaluation. We further conduct extensive\nexperiments to evaluate current image editing baselines from multiple\ndimensions including quantitative, qualitative, and human evaluations. The\nresults reveal the challenging nature of our dataset and the gap between\ncurrent baselines and real-world editing needs.\n","authors":["Kai Zhang","Lingbo Mo","Wenhu Chen","Huan Sun","Yu Su"],"pdf_url":"https://arxiv.org/pdf/2306.10012v2.pdf","comment":"NeurIPS 2023; Website: https://osu-nlp-group.github.io/MagicBrush/"},{"id":"http://arxiv.org/abs/2305.06988v2","updated":"2023-11-29T21:24:35Z","published":"2023-05-11T17:23:00Z","title":"Self-Chained Image-Language Model for Video Localization and Question\n Answering","summary":" Recent studies have shown promising results on utilizing large pre-trained\nimage-language models for video question answering. While these image-language\nmodels can efficiently bootstrap the representation learning of video-language\nmodels, they typically concatenate uniformly sampled video frames as visual\ninputs without explicit language-aware, temporal modeling. 
When only a portion\nof a video input is relevant to the language query, such uniform frame sampling\ncan often lead to missing important visual cues. Although humans often find a\nvideo moment to focus on and rewind the moment to answer questions, training a\nquery-aware video moment localizer often requires expensive annotations and\nhigh computational costs. To address this issue, we propose Self-Chained Video\nLocalization-Answering (SeViLA), a novel framework that leverages a single\nimage-language model (BLIP-2) to tackle both temporal keyframe localization and\nQA on videos. SeViLA framework consists of two modules: Localizer and Answerer,\nwhere both are parameter-efficiently fine-tuned from BLIP-2. We propose two\nways of chaining these modules for cascaded inference and self-refinement.\nFirst, in the forward chain, the Localizer finds multiple language-aware\nkeyframes in a video, which the Answerer uses to predict the answer. Second, in\nthe reverse chain, the Answerer generates keyframe pseudo-labels to refine the\nLocalizer, alleviating the need for expensive video moment localization\nannotations. Our SeViLA framework outperforms several strong baselines on 5\nchallenging video QA and event prediction benchmarks, and achieves the\nstate-of-the-art in both fine-tuning (NExT-QA, STAR) and zero-shot (NExT-QA,\nSTAR, How2QA, VLEP) settings. We also analyze the impact of Localizer,\ncomparisons of Localizer with other temporal localization models,\npre-training/self-refinement of Localizer, and varying the number of keyframes.\n","authors":["Shoubin Yu","Jaemin Cho","Prateek Yadav","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2305.06988v2.pdf","comment":"NeurIPS 2023; Our code and checkpoints are available at:\n https://github.com/Yui010206/SeViLA"},{"id":"http://arxiv.org/abs/2311.18063v1","updated":"2023-11-29T20:22:44Z","published":"2023-11-29T20:22:44Z","title":"TurkishBERTweet: Fast and Reliable Large Language Model for Social Media\n Analysis","summary":" Turkish is one of the most popular languages in the world. Wide us of this\nlanguage on social media platforms such as Twitter, Instagram, or Tiktok and\nstrategic position of the country in the world politics makes it appealing for\nthe social network researchers and industry. To address this need, we introduce\nTurkishBERTweet, the first large scale pre-trained language model for Turkish\nsocial media built using almost 900 million tweets. The model shares the same\narchitecture as base BERT model with smaller input length, making\nTurkishBERTweet lighter than BERTurk and can have significantly lower inference\ntime. We trained our model using the same approach for RoBERTa model and\nevaluated on two text classification tasks: Sentiment Classification and Hate\nSpeech Detection. We demonstrate that TurkishBERTweet outperforms the other\navailable alternatives on generalizability and its lower inference time gives\nsignificant advantage to process large-scale datasets. We also compared our\nmodels with the commercial OpenAI solutions in terms of cost and performance to\ndemonstrate TurkishBERTweet is scalable and cost-effective solution. As part of\nour research, we released TurkishBERTweet and fine-tuned LoRA adapters for the\nmentioned tasks under the MIT License to facilitate future research and\napplications on Turkish social media. 
Our TurkishBERTweet model is available\nat: https://github.com/ViralLab/TurkishBERTweet\n","authors":["Ali Najafi","Onur Varol"],"pdf_url":"https://arxiv.org/pdf/2311.18063v1.pdf","comment":"21 pages, 4 figures, 8 tables"},{"id":"http://arxiv.org/abs/2311.18054v1","updated":"2023-11-29T20:04:57Z","published":"2023-11-29T20:04:57Z","title":"I Know You Did Not Write That! A Sampling Based Watermarking Method for\n Identifying Machine Generated Text","summary":" Potential harms of Large Language Models such as mass misinformation and\nplagiarism can be partially mitigated if there exists a reliable way to detect\nmachine generated text. In this paper, we propose a new watermarking method to\ndetect machine-generated texts. Our method embeds a unique pattern within the\ngenerated text, ensuring that while the content remains coherent and natural to\nhuman readers, it carries distinct markers that can be identified\nalgorithmically. Specifically, we intervene with the token sampling process in\na way which enables us to trace back our token choices during the detection\nphase. We show how watermarking affects textual quality and compare our\nproposed method with a state-of-the-art watermarking method in terms of\nrobustness and detectability. Through extensive experiments, we demonstrate the\neffectiveness of our watermarking scheme in distinguishing between watermarked\nand non-watermarked text, achieving high detection rates while maintaining\ntextual quality.\n","authors":["Kaan Efe Keleş","Ömer Kaan Gürbüz","Mucahid Kutlu"],"pdf_url":"https://arxiv.org/pdf/2311.18054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16867v2","updated":"2023-11-29T19:45:10Z","published":"2023-11-28T15:12:47Z","title":"The Falcon Series of Open Language Models","summary":" We introduce the Falcon series: 7B, 40B, and 180B parameters causal\ndecoder-only models trained on a diverse high-quality corpora predominantly\nassembled from web data. The largest model, Falcon-180B, has been trained on\nover 3.5 trillion tokens of text--the largest openly documented pretraining\nrun. Falcon-180B significantly outperforms models such as PaLM or Chinchilla,\nand improves upon concurrently developed models such as LLaMA 2 or\nInflection-1. It nears the performance of PaLM-2-Large at a reduced pretraining\nand inference cost, making it, to our knowledge, one of the three best language\nmodels in the world along with GPT-4 and PaLM-2-Large. We report detailed\nevaluations, as well as a deep dive into the methods and custom tooling\nemployed to pretrain Falcon. Notably, we report on our custom distributed\ntraining codebase, allowing us to efficiently pretrain these models on up to\n4,096 A100s on cloud AWS infrastructure with limited interconnect. 
We release a\n600B tokens extract of our web dataset, as well as the Falcon-7/40/180B models\nunder a permissive license to foster open-science and accelerate the\ndevelopment of an open ecosystem of large language models.\n","authors":["Ebtesam Almazrouei","Hamza Alobeidli","Abdulaziz Alshamsi","Alessandro Cappelli","Ruxandra Cojocaru","Mérouane Debbah","Étienne Goffinet","Daniel Hesslow","Julien Launay","Quentin Malartic","Daniele Mazzotta","Badreddine Noune","Baptiste Pannier","Guilherme Penedo"],"pdf_url":"https://arxiv.org/pdf/2311.16867v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18041v1","updated":"2023-11-29T19:34:34Z","published":"2023-11-29T19:34:34Z","title":"Zero-shot Conversational Summarization Evaluations with small Large\n Language Models","summary":" Large Language Models (LLMs) exhibit powerful summarization abilities.\nHowever, their capabilities on conversational summarization remains under\nexplored. In this work we evaluate LLMs (approx. 10 billion parameters) on\nconversational summarization and showcase their performance on various prompts.\nWe show that the summaries generated by models depend on the instructions and\nthe performance of LLMs vary with different instructions sometimes resulting\nsteep drop in ROUGE scores if prompts are not selected carefully. We also\nevaluate the models with human evaluations and discuss the limitations of the\nmodels on conversational summarization\n","authors":["Ramesh Manuvinakurike","Saurav Sahay","Sangeeta Manepalli","Lama Nachman"],"pdf_url":"https://arxiv.org/pdf/2311.18041v1.pdf","comment":"Accepted at RoF0Mo workshop at Neurips 2023"},{"id":"http://arxiv.org/abs/2311.18034v1","updated":"2023-11-29T19:20:14Z","published":"2023-11-29T19:20:14Z","title":"Hyperpolyglot LLMs: Cross-Lingual Interpretability in Token Embeddings","summary":" Cross-lingual transfer learning is an important property of multilingual\nlarge language models (LLMs). But how do LLMs represent relationships between\nlanguages? Every language model has an input layer that maps tokens to vectors.\nThis ubiquitous layer of language models is often overlooked. We find that\nsimilarities between these input embeddings are highly interpretable and that\nthe geometry of these embeddings differs between model families. In one case\n(XLM-RoBERTa), embeddings encode language: tokens in different writing systems\ncan be linearly separated with an average of 99.2% accuracy. Another family\n(mT5) represents cross-lingual semantic similarity: the 50 nearest neighbors\nfor any token represent an average of 7.61 writing systems, and are frequently\ntranslations. This result is surprising given that there is no explicit\nparallel cross-lingual training corpora and no explicit incentive for\ntranslations in pre-training objectives. Our research opens the door for\ninvestigations in 1) The effect of pre-training and model architectures on\nrepresentations of languages and 2) The applications of cross-lingual\nrepresentations embedded in language models.\n","authors":["Andrea W Wen-Yi","David Mimno"],"pdf_url":"https://arxiv.org/pdf/2311.18034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16733v2","updated":"2023-11-29T19:16:00Z","published":"2023-11-28T12:29:33Z","title":"LLMs for Science: Usage for Code Generation and Data Analysis","summary":" Large language models (LLMs) have been touted to enable increased\nproductivity in many areas of today's work life. 
Scientific research as an area\nof work is no exception: the potential of LLM-based tools to assist in the\ndaily work of scientists has become a highly discussed topic across\ndisciplines. However, we are only at the very onset of this subject of study.\nIt is still unclear how the potential of LLMs will materialise in research\npractice. With this study, we give first empirical evidence on the use of LLMs\nin the research process. We have investigated a set of use cases for LLM-based\ntools in scientific research, and conducted a first study to assess to which\ndegree current tools are helpful. In this paper we report specifically on use\ncases related to software engineering, such as generating application code and\ndeveloping scripts for data analytics. While we studied seemingly simple use\ncases, results across tools differ significantly. Our results highlight the\npromise of LLM-based tools in general, yet we also observe various issues,\nparticularly regarding the integrity of the output these tools provide.\n","authors":["Mohamed Nejjar","Luca Zacharias","Fabian Stiehle","Ingo Weber"],"pdf_url":"https://arxiv.org/pdf/2311.16733v2.pdf","comment":"Preprint; In Submission"},{"id":"http://arxiv.org/abs/2311.18028v1","updated":"2023-11-29T19:11:55Z","published":"2023-11-29T19:11:55Z","title":"Filtered Semi-Markov CRF","summary":" Semi-Markov CRF has been proposed as an alternative to the traditional Linear\nChain CRF for text segmentation tasks such as Named Entity Recognition (NER).\nUnlike CRF, which treats text segmentation as token-level prediction, Semi-CRF\nconsiders segments as the basic unit, making it more expressive. However,\nSemi-CRF suffers from two major drawbacks: (1) quadratic complexity over\nsequence length, as it operates on every span of the input sequence, and (2)\ninferior performance compared to CRF for sequence labeling tasks like NER. In\nthis paper, we introduce Filtered Semi-Markov CRF, a variant of Semi-CRF that\naddresses these issues by incorporating a filtering step to eliminate\nirrelevant segments, reducing complexity and search space. Our approach is\nevaluated on several NER benchmarks, where it outperforms both CRF and Semi-CRF\nwhile being significantly faster. The implementation of our method is available\non \\href{https://github.com/urchade/Filtered-Semi-Markov-CRF}{Github}.\n","authors":["Urchade Zaratiana","Nadi Tomeh","Niama El Khbir","Pierre Holat","Thierry Charnois"],"pdf_url":"https://arxiv.org/pdf/2311.18028v1.pdf","comment":"EMNLP 2023 (Findings)"},{"id":"http://arxiv.org/abs/2310.17703v2","updated":"2023-11-29T19:10:58Z","published":"2023-10-26T18:03:46Z","title":"The impact of responding to patient messages with large language model\n assistance","summary":" Documentation burden is a major contributor to clinician burnout, which is\nrising nationally and is an urgent threat to our ability to care for patients.\nArtificial intelligence (AI) chatbots, such as ChatGPT, could reduce clinician\nburden by assisting with documentation. Although many hospitals are actively\nintegrating such systems into electronic medical record systems, AI chatbots\nutility and impact on clinical decision-making have not been studied for this\nintended use. We are the first to examine the utility of large language models\nin assisting clinicians draft responses to patient questions. 
In our two-stage\ncross-sectional study, 6 oncologists responded to 100 realistic synthetic\ncancer patient scenarios and portal messages developed to reflect common\nmedical situations, first manually, then with AI assistance.\n We find AI-assisted responses were longer, less readable, but provided\nacceptable drafts without edits 58% of time. AI assistance improved efficiency\n77% of time, with low harm risk (82% safe). However, 7.7% unedited AI responses\ncould severely harm. In 31% cases, physicians thought AI drafts were\nhuman-written. AI assistance led to more patient education recommendations,\nfewer clinical actions than manual responses. Results show promise for AI to\nimprove clinician efficiency and patient care through assisting documentation,\nif used judiciously. Monitoring model outputs and human-AI interaction remains\ncrucial for safe implementation.\n","authors":["Shan Chen","Marco Guevara","Shalini Moningi","Frank Hoebers","Hesham Elhalawani","Benjamin H. Kann","Fallon E. Chipidza","Jonathan Leeman","Hugo J. W. L. Aerts","Timothy Miller","Guergana K. Savova","Raymond H. Mak","Maryam Lustberg","Majid Afshar","Danielle S. Bitterman"],"pdf_url":"https://arxiv.org/pdf/2310.17703v2.pdf","comment":"4 figures and tables in main, submitted for review"},{"id":"http://arxiv.org/abs/2311.17972v1","updated":"2023-11-29T16:02:06Z","published":"2023-11-29T16:02:06Z","title":"Self-Infilling Code Generation","summary":" This work introduces a general code generation framework that incorporates\ninfilling operations into auto-regressive decoding. Our approach capitalizes on\nthe observation that recent code language models with infilling capabilities\ncan perform \\emph{self-infilling}: whereas infilling operations aim to fill in\nthe middle based on a predefined prefix and suffix, self-infilling sequentially\ngenerates both such surrounding context and the infilled content. We utilize\nthis feature to develop an infilling-augmented decoding process that\nfacilitates non-monotonic generation. This approach allows for postponing the\ngeneration of uncertain code snippets until a definitive suffix is established,\nleading to improved control over the generation sequence. In addition, it\nfacilitates a looping mechanism, which can iteratively update and synchronize\neach piece of generation in a cyclic manner. Extensive experiments are\nconducted to demonstrate that our proposed decoding process is effective in\nenhancing regularity and quality across several code generation benchmarks.\n","authors":["Lin Zheng","Jianbo Yuan","Zhi Zhang","Hongxia Yang","Lingpeng Kong"],"pdf_url":"https://arxiv.org/pdf/2311.17972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17946v1","updated":"2023-11-29T03:42:16Z","published":"2023-11-29T03:42:16Z","title":"DreamSync: Aligning Text-to-Image Generation with Image Understanding\n Feedback","summary":" Despite their wide-spread success, Text-to-Image models (T2I) still struggle\nto produce images that are both aesthetically pleasing and faithful to the\nuser's input text. We introduce DreamSync, a model-agnostic training algorithm\nby design that improves T2I models to be faithful to the text input. DreamSync\nbuilds off a recent insight from TIFA's evaluation framework -- that large\nvision-language models (VLMs) can effectively identify the fine-grained\ndiscrepancies between generated images and the text inputs. DreamSync uses this\ninsight to train T2I models without any labeled data; it improves T2I models\nusing its own generations. 
First, it prompts the model to generate several\ncandidate images for a given input text. Then, it uses two VLMs to select the\nbest generation: a Visual Question Answering model that measures the alignment\nof generated images to the text, and another that measures the generation's\naesthetic quality. After selection, we use LoRA to iteratively finetune the T2I\nmodel to guide its generation towards the selected best generations. DreamSync\ndoes not need any additional human annotation. model architecture changes, or\nreinforcement learning. Despite its simplicity, DreamSync improves both the\nsemantic alignment and aesthetic appeal of two diffusion-based T2I models,\nevidenced by multiple benchmarks (+1.7% on TIFA, +2.9% on DSG1K, +3.4% on VILA\naesthetic) and human evaluation.\n","authors":["Jiao Sun","Deqing Fu","Yushi Hu","Su Wang","Royi Rassin","Da-Cheng Juan","Dana Alon","Charles Herrmann","Sjoerd van Steenkiste","Ranjay Krishna","Cyrus Rashtchian"],"pdf_url":"https://arxiv.org/pdf/2311.17946v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2311.17919v1","updated":"2023-11-29T18:59:59Z","published":"2023-11-29T18:59:59Z","title":"Visual Anagrams: Generating Multi-View Optical Illusions with Diffusion\n Models","summary":" We address the problem of synthesizing multi-view optical illusions: images\nthat change appearance upon a transformation, such as a flip or rotation. We\npropose a simple, zero-shot method for obtaining these illusions from\noff-the-shelf text-to-image diffusion models. During the reverse diffusion\nprocess, we estimate the noise from different views of a noisy image. We then\ncombine these noise estimates together and denoise the image. A theoretical\nanalysis suggests that this method works precisely for views that can be\nwritten as orthogonal transformations, of which permutations are a subset. This\nleads to the idea of a visual anagram--an image that changes appearance under\nsome rearrangement of pixels. This includes rotations and flips, but also more\nexotic pixel permutations such as a jigsaw rearrangement. Our approach also\nnaturally extends to illusions with more than two views. We provide both\nqualitative and quantitative results demonstrating the effectiveness and\nflexibility of our method. Please see our project webpage for additional\nvisualizations and results: https://dangeng.github.io/visual_anagrams/\n","authors":["Daniel Geng","Inbum Park","Andrew Owens"],"pdf_url":"https://arxiv.org/pdf/2311.17919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17921v1","updated":"2023-11-29T18:59:59Z","published":"2023-11-29T18:59:59Z","title":"Do text-free diffusion models learn discriminative visual\n representations?","summary":" While many unsupervised learning models focus on one family of tasks, either\ngenerative or discriminative, we explore the possibility of a unified\nrepresentation learner: a model which addresses both families of tasks\nsimultaneously. We identify diffusion models, a state-of-the-art method for\ngenerative tasks, as a prime candidate. Such models involve training a U-Net to\niteratively predict and remove noise, and the resulting model can synthesize\nhigh-fidelity, diverse, novel images. We find that the intermediate feature\nmaps of the U-Net are diverse, discriminative feature representations. 
We\npropose a novel attention mechanism for pooling feature maps and further\nleverage this mechanism as DifFormer, a transformer feature fusion of features\nfrom different diffusion U-Net blocks and noise steps. We also develop DifFeed,\na novel feedback mechanism tailored to diffusion. We find that diffusion models\nare better than GANs, and, with our fusion and feedback mechanisms, can compete\nwith state-of-the-art unsupervised image representation learning methods for\ndiscriminative tasks - image classification with full and semi-supervision,\ntransfer for fine-grained classification, object detection and segmentation,\nand semantic segmentation. Our project website\n(https://mgwillia.github.io/diffssl/) and code\n(https://github.com/soumik-kanad/diffssl) are available publicly.\n","authors":["Soumik Mukhopadhyay","Matthew Gwilliam","Yosuke Yamaguchi","Vatsal Agarwal","Namitha Padmanabhan","Archana Swaminathan","Tianyi Zhou","Abhinav Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2311.17921v1.pdf","comment":"Website: see https://mgwillia.github.io/diffssl/ . Code: see\n https://github.com/soumik-kanad/diffssl . The first two authors contributed\n equally. 15 pages, 9 figures, 15 tables. Submission under review. arXiv admin\n note: text overlap with arXiv:2307.08702"},{"id":"http://arxiv.org/abs/2311.17922v1","updated":"2023-11-29T18:59:59Z","published":"2023-11-29T18:59:59Z","title":"A Simple Recipe for Language-guided Domain Generalized Segmentation","summary":" Generalization to new domains not seen during training is one of the\nlong-standing goals and challenges in deploying neural networks in real-world\napplications. Existing generalization techniques necessitate substantial data\naugmentation, potentially sourced from external datasets, and aim at learning\ninvariant representations by imposing various alignment constraints.\nLarge-scale pretraining has recently shown promising generalization\ncapabilities, along with the potential of bridging different modalities. For\ninstance, the recent advent of vision-language models like CLIP has opened the\ndoorway for vision models to exploit the textual modality. In this paper, we\nintroduce a simple framework for generalizing semantic segmentation networks by\nemploying language as the source of randomization. Our recipe comprises three\nkey ingredients: i) the preservation of the intrinsic CLIP robustness through\nminimal fine-tuning, ii) language-driven local style augmentation, and iii)\nrandomization by locally mixing the source and augmented styles during\ntraining. Extensive experiments report state-of-the-art results on various\ngeneralization benchmarks. The code will be made available.\n","authors":["Mohammad Fahes","Tuan-Hung Vu","Andrei Bursuc","Patrick Pérez","Raoul de Charette"],"pdf_url":"https://arxiv.org/pdf/2311.17922v1.pdf","comment":"Project page: https://astra-vision.github.io/FAMix"},{"id":"http://arxiv.org/abs/2311.17918v1","updated":"2023-11-29T18:59:47Z","published":"2023-11-29T18:59:47Z","title":"Driving into the Future: Multiview Visual Forecasting and Planning with\n World Model for Autonomous Driving","summary":" In autonomous driving, predicting future events in advance and evaluating the\nforeseeable risks empowers autonomous vehicles to better plan their actions,\nenhancing safety and efficiency on the road. To this end, we propose Drive-WM,\nthe first driving world model compatible with existing end-to-end planning\nmodels. 
Through a joint spatial-temporal modeling facilitated by view\nfactorization, our model generates high-fidelity multiview videos in driving\nscenes. Building on its powerful generation ability, we showcase the potential\nof applying the world model for safe driving planning for the first time.\nParticularly, our Drive-WM enables driving into multiple futures based on\ndistinct driving maneuvers, and determines the optimal trajectory according to\nthe image-based rewards. Evaluation on real-world driving datasets verifies\nthat our method could generate high-quality, consistent, and controllable\nmultiview videos, opening up possibilities for real-world simulations and safe\nplanning.\n","authors":["Yuqi Wang","Jiawei He","Lue Fan","Hongxin Li","Yuntao Chen","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.17918v1.pdf","comment":"Project page: https://drive-wm.github.io. Code:\n https://github.com/BraveGroup/Drive-WM"},{"id":"http://arxiv.org/abs/2311.17917v1","updated":"2023-11-29T18:59:32Z","published":"2023-11-29T18:59:32Z","title":"AvatarStudio: High-fidelity and Animatable 3D Avatar Creation from Text","summary":" We study the problem of creating high-fidelity and animatable 3D avatars from\nonly textual descriptions. Existing text-to-avatar methods are either limited\nto static avatars which cannot be animated or struggle to generate animatable\navatars with promising quality and precise pose control. To address these\nlimitations, we propose AvatarStudio, a coarse-to-fine generative model that\ngenerates explicit textured 3D meshes for animatable human avatars.\nSpecifically, AvatarStudio begins with a low-resolution NeRF-based\nrepresentation for coarse generation, followed by incorporating SMPL-guided\narticulation into the explicit mesh representation to support avatar animation\nand high resolution rendering. To ensure view consistency and pose\ncontrollability of the resulting avatars, we introduce a 2D diffusion model\nconditioned on DensePose for Score Distillation Sampling supervision. By\neffectively leveraging the synergy between the articulated mesh representation\nand the DensePose-conditional diffusion model, AvatarStudio can create\nhigh-quality avatars from text that are ready for animation, significantly\noutperforming previous methods. Moreover, it is competent for many\napplications, e.g., multimodal avatar animations and style-guided avatar\ncreation. For more results, please refer to our project page:\nhttp://jeff95.me/projects/avatarstudio.html\n","authors":["Jianfeng Zhang","Xuanmeng Zhang","Huichao Zhang","Jun Hao Liew","Chenxu Zhang","Yi Yang","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2311.17917v1.pdf","comment":"Project page at http://jeff95.me/projects/avatarstudio.html"},{"id":"http://arxiv.org/abs/2311.17911v1","updated":"2023-11-29T18:57:07Z","published":"2023-11-29T18:57:07Z","title":"OPERA: Alleviating Hallucination in Multi-Modal Large Language Models\n via Over-Trust Penalty and Retrospection-Allocation","summary":" Hallucination, posed as a pervasive challenge of multi-modal large language\nmodels (MLLMs), has significantly impeded their real-world usage that demands\nprecise judgment. Existing methods mitigate this issue with either training\nwith specific designed data or inferencing with external knowledge from other\nsources, incurring inevitable additional costs. 
In this paper, we present\nOPERA, a novel MLLM decoding method grounded in an Over-trust Penalty and a\nRetrospection-Allocation strategy, serving as a nearly free lunch to alleviate\nthe hallucination issue without additional data, knowledge, or training. Our\napproach begins with an interesting observation that, most hallucinations are\nclosely tied to the knowledge aggregation patterns manifested in the\nself-attention matrix, i.e., MLLMs tend to generate new tokens by focusing on a\nfew summary tokens, but not all the previous tokens. Such partial over-trust\ninclination results in the neglecting of image tokens and describes the image\ncontent with hallucination. Statistically, we observe an 80%$\\sim$95%\nco-currency rate between hallucination contents and such knowledge aggregation\npatterns. Based on the observation, OPERA introduces a penalty term on the\nmodel logits during the beam-search decoding to mitigate the over-trust issue,\nalong with a rollback strategy that retrospects the presence of summary tokens\nin the previously generated tokens, and re-allocate the token selection if\nnecessary. With extensive experiments, OPERA shows significant\nhallucination-mitigating performance on different MLLMs and metrics, proving\nits effectiveness and generality. Our code is available at:\nhttps://github.com/shikiw/OPERA.\n","authors":["Qidong Huang","Xiaoyi Dong","Pan Zhang","Bin Wang","Conghui He","Jiaqi Wang","Dahua Lin","Weiming Zhang","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2311.17911v1.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2311.17910v1","updated":"2023-11-29T18:56:32Z","published":"2023-11-29T18:56:32Z","title":"HUGS: Human Gaussian Splats","summary":" Recent advances in neural rendering have improved both training and rendering\ntimes by orders of magnitude. While these methods demonstrate state-of-the-art\nquality and speed, they are designed for photogrammetry of static scenes and do\nnot generalize well to freely moving humans in the environment. In this work,\nwe introduce Human Gaussian Splats (HUGS) that represents an animatable human\ntogether with the scene using 3D Gaussian Splatting (3DGS). Our method takes\nonly a monocular video with a small number of (50-100) frames, and it\nautomatically learns to disentangle the static scene and a fully animatable\nhuman avatar within 30 minutes. We utilize the SMPL body model to initialize\nthe human Gaussians. To capture details that are not modeled by SMPL (e.g.\ncloth, hairs), we allow the 3D Gaussians to deviate from the human body model.\nUtilizing 3D Gaussians for animated humans brings new challenges, including the\nartifacts created when articulating the Gaussians. We propose to jointly\noptimize the linear blend skinning weights to coordinate the movements of\nindividual Gaussians during animation. Our approach enables novel-pose\nsynthesis of human and novel view synthesis of both the human and the scene. We\nachieve state-of-the-art rendering quality with a rendering speed of 60 FPS\nwhile being ~100x faster to train over previous work. 
Our code will be\nannounced here: https://github.com/apple/ml-hugs\n","authors":["Muhammed Kocabas","Jen-Hao Rick Chang","James Gabriel","Oncel Tuzel","Anurag Ranjan"],"pdf_url":"https://arxiv.org/pdf/2311.17910v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17907v1","updated":"2023-11-29T18:55:38Z","published":"2023-11-29T18:55:38Z","title":"CG3D: Compositional Generation for Text-to-3D via Gaussian Splatting","summary":" With the onset of diffusion-based generative models and their ability to\ngenerate text-conditioned images, content generation has received a massive\ninvigoration. Recently, these models have been shown to provide useful guidance\nfor the generation of 3D graphics assets. However, existing work in\ntext-conditioned 3D generation faces fundamental constraints: (i) inability to\ngenerate detailed, multi-object scenes, (ii) inability to textually control\nmulti-object configurations, and (iii) physically realistic scene composition.\nIn this work, we propose CG3D, a method for compositionally generating scalable\n3D assets that resolves these constraints. We find that explicit Gaussian\nradiance fields, parameterized to allow for compositions of objects, possess\nthe capability to enable semantically and physically consistent scenes. By\nutilizing a guidance framework built around this explicit representation, we\nshow state of the art results, capable of even exceeding the guiding diffusion\nmodel in terms of object combinations and physics accuracy.\n","authors":["Alexander Vilesov","Pradyumna Chari","Achuta Kadambi"],"pdf_url":"https://arxiv.org/pdf/2311.17907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17902v1","updated":"2023-11-29T18:53:47Z","published":"2023-11-29T18:53:47Z","title":"Language-conditioned Detection Transformer","summary":" We present a new open-vocabulary detection framework. Our framework uses both\nimage-level labels and detailed detection annotations when available. Our\nframework proceeds in three steps. We first train a language-conditioned object\ndetector on fully-supervised detection data. This detector gets to see the\npresence or absence of ground truth classes during training, and conditions\nprediction on the set of present classes. We use this detector to pseudo-label\nimages with image-level labels. Our detector provides much more accurate\npseudo-labels than prior approaches with its conditioning mechanism. Finally,\nwe train an unconditioned open-vocabulary detector on the pseudo-annotated\nimages. The resulting detector, named DECOLA, shows strong zero-shot\nperformance in open-vocabulary LVIS benchmark as well as direct zero-shot\ntransfer benchmarks on LVIS, COCO, Object365, and OpenImages. DECOLA\noutperforms the prior arts by 17.1 AP-rare and 9.4 mAP on zero-shot LVIS\nbenchmark. DECOLA achieves state-of-the-art results in various model sizes,\narchitectures, and datasets by only training on open-sourced data and\nacademic-scale computing. Code is available at\nhttps://github.com/janghyuncho/DECOLA.\n","authors":["Jang Hyun Cho","Philipp Krähenbühl"],"pdf_url":"https://arxiv.org/pdf/2311.17902v1.pdf","comment":"Code is at https://github.com/janghyuncho/DECOLA"},{"id":"http://arxiv.org/abs/2311.17901v1","updated":"2023-11-29T18:53:34Z","published":"2023-11-29T18:53:34Z","title":"SODA: Bottleneck Diffusion Models for Representation Learning","summary":" We introduce SODA, a self-supervised diffusion model, designed for\nrepresentation learning. 
The model incorporates an image encoder, which\ndistills a source view into a compact representation, that, in turn, guides the\ngeneration of related novel views. We show that by imposing a tight bottleneck\nbetween the encoder and a denoising decoder, and leveraging novel view\nsynthesis as a self-supervised objective, we can turn diffusion models into\nstrong representation learners, capable of capturing visual semantics in an\nunsupervised manner. To the best of our knowledge, SODA is the first diffusion\nmodel to succeed at ImageNet linear-probe classification, and, at the same\ntime, it accomplishes reconstruction, editing and synthesis tasks across a wide\nrange of datasets. Further investigation reveals the disentangled nature of its\nemergent latent space, that serves as an effective interface to control and\nmanipulate the model's produced images. All in all, we aim to shed light on the\nexciting and promising potential of diffusion models, not only for image\ngeneration, but also for learning rich and robust representations.\n","authors":["Drew A. Hudson","Daniel Zoran","Mateusz Malinowski","Andrew K. Lampinen","Andrew Jaegle","James L. McClelland","Loic Matthey","Felix Hill","Alexander Lerchner"],"pdf_url":"https://arxiv.org/pdf/2311.17901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17898v1","updated":"2023-11-29T18:51:46Z","published":"2023-11-29T18:51:46Z","title":"Knowledge Pursuit Prompting for Zero-Shot Multimodal Synthesis","summary":" Hallucinations and unfaithful synthesis due to inaccurate prompts with\ninsufficient semantic details are widely observed in multimodal generative\nmodels. A prevalent strategy to align multiple modalities is to fine-tune the\ngenerator with a large number of annotated text-image pairs. However, such a\nprocedure is labor-consuming and resource-draining. The key question we ask is:\ncan we enhance the quality and faithfulness of text-driven generative models\nbeyond extensive text-image pair annotations? To address this question, we\npropose Knowledge Pursuit Prompting (KPP), a zero-shot framework that\niteratively incorporates external knowledge to help generators produce reliable\nvisual content. Instead of training generators to handle generic prompts, KPP\nemploys a recursive knowledge query process to gather informative external\nfacts from the knowledge base, instructs a language model to compress the\nacquired knowledge for prompt refinement, and utilizes text-driven generators\nfor visual synthesis. The entire process is zero-shot, without accessing the\narchitectures and parameters of generative models. We evaluate the framework\nacross multiple text-driven generative tasks (image, 3D rendering, and video)\non datasets of different domains. 
We further demonstrate the extensibility and\nadaptability of KPP through varying foundation model bases and instructions.\nOur results show that KPP is capable of generating faithful and semantically\nrich content across diverse visual domains, offering a promising solution to\nimprove multimodal generative models.\n","authors":["Jinqi Luo","Kwan Ho Ryan Chan","Dimitris Dimos","René Vidal"],"pdf_url":"https://arxiv.org/pdf/2311.17898v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17893v1","updated":"2023-11-29T18:47:17Z","published":"2023-11-29T18:47:17Z","title":"Betrayed by Attention: A Simple yet Effective Approach for\n Self-supervised Video Object Segmentation","summary":" In this paper, we propose a simple yet effective approach for self-supervised\nvideo object segmentation (VOS). Our key insight is that the inherent\nstructural dependencies present in DINO-pretrained Transformers can be\nleveraged to establish robust spatio-temporal correspondences in videos.\nFurthermore, simple clustering on this correspondence cue is sufficient to\nyield competitive segmentation results. Previous self-supervised VOS techniques\nmajorly resort to auxiliary modalities or utilize iterative slot attention to\nassist in object discovery, which restricts their general applicability and\nimposes higher computational requirements. To deal with these challenges, we\ndevelop a simplified architecture that capitalizes on the emerging objectness\nfrom DINO-pretrained Transformers, bypassing the need for additional modalities\nor slot attention. Specifically, we first introduce a single spatio-temporal\nTransformer block to process the frame-wise DINO features and establish\nspatio-temporal dependencies in the form of self-attention. Subsequently,\nutilizing these attention maps, we implement hierarchical clustering to\ngenerate object segmentation masks. To train the spatio-temporal block in a\nfully self-supervised manner, we employ semantic and dynamic motion consistency\ncoupled with entropy normalization. Our method demonstrates state-of-the-art\nperformance across multiple unsupervised VOS benchmarks and particularly excels\nin complex real-world multi-object video segmentation tasks such as\nDAVIS-17-Unsupervised and YouTube-VIS-19. The code and model checkpoints will\nbe released at https://github.com/shvdiwnkozbw/SSL-UVOS.\n","authors":["Shuangrui Ding","Rui Qian","Haohang Xu","Dahua Lin","Hongkai Xiong"],"pdf_url":"https://arxiv.org/pdf/2311.17893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17891v1","updated":"2023-11-29T18:44:12Z","published":"2023-11-29T18:44:12Z","title":"Pose Anything: A Graph-Based Approach for Category-Agnostic Pose\n Estimation","summary":" Traditional 2D pose estimation models are limited by their category-specific\ndesign, making them suitable only for predefined object categories. This\nrestriction becomes particularly challenging when dealing with novel objects\ndue to the lack of relevant training data.\n To address this limitation, category-agnostic pose estimation (CAPE) was\nintroduced. CAPE aims to enable keypoint localization for arbitrary object\ncategories using a single model, requiring minimal support images with\nannotated keypoints. 
This approach not only enables object pose generation\nbased on arbitrary keypoint definitions but also significantly reduces the\nassociated costs, paving the way for versatile and adaptable pose estimation\napplications.\n We present a novel approach to CAPE that leverages the inherent geometrical\nrelations between keypoints through a newly designed Graph Transformer Decoder.\nBy capturing and incorporating this crucial structural information, our method\nenhances the accuracy of keypoint localization, marking a significant departure\nfrom conventional CAPE techniques that treat keypoints as isolated entities.\n We validate our approach on the MP-100 benchmark, a comprehensive dataset\ncomprising over 20,000 images spanning more than 100 categories. Our method\noutperforms the prior state-of-the-art by substantial margins, achieving\nremarkable improvements of 2.16% and 1.82% under 1-shot and 5-shot settings,\nrespectively. Furthermore, our method's end-to-end training demonstrates both\nscalability and efficiency compared to previous CAPE approaches.\n","authors":["Or Hirschorn","Shai Avidan"],"pdf_url":"https://arxiv.org/pdf/2311.17891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17878v1","updated":"2023-11-29T18:23:18Z","published":"2023-11-29T18:23:18Z","title":"TSDF-Sampling: Efficient Sampling for Neural Surface Field using\n Truncated Signed Distance Field","summary":" Multi-view neural surface reconstruction has exhibited impressive results.\nHowever, a notable limitation is the prohibitively slow inference time when\ncompared to traditional techniques, primarily attributed to the dense sampling,\nrequired to maintain the rendering quality. This paper introduces a novel\napproach that substantially reduces the number of samplings by incorporating\nthe Truncated Signed Distance Field (TSDF) of the scene. While prior works have\nproposed importance sampling, their dependence on initial uniform samples over\nthe entire space makes them unable to avoid performance degradation when trying\nto use less number of samples. In contrast, our method leverages the TSDF\nvolume generated only by the trained views, and it proves to provide a\nreasonable bound on the sampling from upcoming novel views. As a result, we\nachieve high rendering quality by fully exploiting the continuous neural SDF\nestimation within the bounds given by the TSDF volume. Notably, our method is\nthe first approach that can be robustly plug-and-play into a diverse array of\nneural surface field models, as long as they use the volume rendering\ntechnique. Our empirical results show an 11-fold increase in inference speed\nwithout compromising performance. The result videos are available at our\nproject page: https://tsdf-sampling.github.io/\n","authors":["Chaerin Min","Sehyun Cha","Changhee Won","Jongwoo Lim"],"pdf_url":"https://arxiv.org/pdf/2311.17878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17876v1","updated":"2023-11-29T18:21:24Z","published":"2023-11-29T18:21:24Z","title":"Enhancing Post-Hoc Explanation Benchmark Reliability for Image\n Classification","summary":" Deep neural networks, while powerful for image classification, often operate\nas \"black boxes,\" complicating the understanding of their decision-making\nprocesses. Various explanation methods, particularly those generating saliency\nmaps, aim to address this challenge. However, the inconsistency issues of\nfaithfulness metrics hinder reliable benchmarking of explanation methods. 
This\npaper employs an approach inspired by psychometrics, utilizing Krippendorf's\nalpha to quantify the benchmark reliability of post-hoc methods in image\nclassification. The study proposes model training modifications, including\nfeeding perturbed samples and employing focal loss, to enhance robustness and\ncalibration. Empirical evaluations demonstrate significant improvements in\nbenchmark reliability across metrics, datasets, and post-hoc methods. This\npioneering work establishes a foundation for more reliable evaluation practices\nin the realm of post-hoc explanation methods, emphasizing the importance of\nmodel robustness in the assessment process.\n","authors":["Tristan Gomez","Harold Mouchère"],"pdf_url":"https://arxiv.org/pdf/2311.17876v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17874v1","updated":"2023-11-29T18:20:16Z","published":"2023-11-29T18:20:16Z","title":"FisherRF: Active View Selection and Uncertainty Quantification for\n Radiance Fields using Fisher Information","summary":" This study addresses the challenging problem of active view selection and\nuncertainty quantification within the domain of Radiance Fields. Neural\nRadiance Fields (NeRF) have greatly advanced image rendering and\nreconstruction, but the limited availability of 2D images poses uncertainties\nstemming from occlusions, depth ambiguities, and imaging errors. Efficiently\nselecting informative views becomes crucial, and quantifying NeRF model\nuncertainty presents intricate challenges. Existing approaches either depend on\nmodel architecture or are based on assumptions regarding density distributions\nthat are not generally applicable. By leveraging Fisher Information, we\nefficiently quantify observed information within Radiance Fields without ground\ntruth data. This can be used for the next best view selection and pixel-wise\nuncertainty quantification. Our method overcomes existing limitations on model\narchitecture and effectiveness, achieving state-of-the-art results in both view\nselection and uncertainty quantification, demonstrating its potential to\nadvance the field of Radiance Fields. Our method with the 3D Gaussian Splatting\nbackend could perform view selections at 70 fps.\n","authors":["Wen Jiang","Boshu Lei","Kostas Daniilidis"],"pdf_url":"https://arxiv.org/pdf/2311.17874v1.pdf","comment":"Project page: https://jiangwenpl.github.io/FisherRF/"},{"id":"http://arxiv.org/abs/2311.17857v1","updated":"2023-11-29T18:04:07Z","published":"2023-11-29T18:04:07Z","title":"Gaussian Shell Maps for Efficient 3D Human Generation","summary":" Efficient generation of 3D digital humans is important in several industries,\nincluding virtual reality, social media, and cinematic production. 3D\ngenerative adversarial networks (GANs) have demonstrated state-of-the-art\n(SOTA) quality and diversity for generated assets. Current 3D GAN\narchitectures, however, typically rely on volume representations, which are\nslow to render, thereby hampering the GAN training and requiring\nmulti-view-inconsistent 2D upsamplers. Here, we introduce Gaussian Shell Maps\n(GSMs) as a framework that connects SOTA generator network architectures with\nemerging 3D Gaussian rendering primitives using an articulable multi\nshell--based scaffold. In this setting, a CNN generates a 3D texture stack with\nfeatures that are mapped to the shells. The latter represent inflated and\ndeflated versions of a template surface of a digital human in a canonical body\npose. 
Instead of rasterizing the shells directly, we sample 3D Gaussians on the\nshells whose attributes are encoded in the texture features. These Gaussians\nare efficiently and differentiably rendered. The ability to articulate the\nshells is important during GAN training and, at inference time, to deform a\nbody into arbitrary user-defined poses. Our efficient rendering scheme bypasses\nthe need for view-inconsistent upsamplers and achieves high-quality multi-view\nconsistent renderings at a native resolution of $512 \\times 512$ pixels. We\ndemonstrate that GSMs successfully generate 3D humans when trained on\nsingle-view datasets, including SHHQ and DeepFashion.\n","authors":["Rameen Abdal","Wang Yifan","Zifan Shi","Yinghao Xu","Ryan Po","Zhengfei Kuang","Qifeng Chen","Dit-Yan Yeung","Gordon Wetzstein"],"pdf_url":"https://arxiv.org/pdf/2311.17857v1.pdf","comment":"Project page : https://rameenabdal.github.io/GaussianShellMaps/"},{"id":"http://arxiv.org/abs/2311.17851v1","updated":"2023-11-29T17:54:22Z","published":"2023-11-29T17:54:22Z","title":"Evaluating VLMs for Score-Based, Multi-Probe Annotation of 3D Objects","summary":" Unlabeled 3D objects present an opportunity to leverage pretrained vision\nlanguage models (VLMs) on a range of annotation tasks -- from describing object\nsemantics to physical properties. An accurate response must take into account\nthe full appearance of the object in 3D, various ways of phrasing the\nquestion/prompt, and changes in other factors that affect the response. We\npresent a method to marginalize over any factors varied across VLM queries,\nutilizing the VLM's scores for sampled responses. We first show that this\nprobabilistic aggregation can outperform a language model (e.g., GPT4) for\nsummarization, for instance avoiding hallucinations when there are contrasting\ndetails between responses. Secondly, we show that aggregated annotations are\nuseful for prompt-chaining; they help improve downstream VLM predictions (e.g.,\nof object material when the object's type is specified as an auxiliary input in\nthe prompt). Such auxiliary inputs allow ablating and measuring the\ncontribution of visual reasoning over language-only reasoning. Using these\nevaluations, we show how VLMs can approach, without additional training or\nin-context learning, the quality of human-verified type and material\nannotations on the large-scale Objaverse dataset.\n","authors":["Rishabh Kabra","Loic Matthey","Alexander Lerchner","Niloy J. Mitra"],"pdf_url":"https://arxiv.org/pdf/2311.17851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12775v2","updated":"2023-11-29T17:49:41Z","published":"2023-11-21T18:38:03Z","title":"SuGaR: Surface-Aligned Gaussian Splatting for Efficient 3D Mesh\n Reconstruction and High-Quality Mesh Rendering","summary":" We propose a method to allow precise and extremely fast mesh extraction from\n3D Gaussian Splatting. Gaussian Splatting has recently become very popular as\nit yields realistic rendering while being significantly faster to train than\nNeRFs. It is however challenging to extract a mesh from the millions of tiny 3D\ngaussians as these gaussians tend to be unorganized after optimization and no\nmethod has been proposed so far. 
Our first key contribution is a regularization\nterm that encourages the gaussians to align well with the surface of the scene.\nWe then introduce a method that exploits this alignment to extract a mesh from\nthe Gaussians using Poisson reconstruction, which is fast, scalable, and\npreserves details, in contrast to the Marching Cubes algorithm usually applied\nto extract meshes from Neural SDFs. Finally, we introduce an optional\nrefinement strategy that binds gaussians to the surface of the mesh, and\njointly optimizes these Gaussians and the mesh through Gaussian splatting\nrendering. This enables easy editing, sculpting, rigging, animating,\ncompositing and relighting of the Gaussians using traditional softwares by\nmanipulating the mesh instead of the gaussians themselves. Retrieving such an\neditable mesh for realistic rendering is done within minutes with our method,\ncompared to hours with the state-of-the-art methods on neural SDFs, while\nproviding a better rendering quality. Our project page is the following:\nhttps://imagine.enpc.fr/~guedona/sugar/\n","authors":["Antoine Guédon","Vincent Lepetit"],"pdf_url":"https://arxiv.org/pdf/2311.12775v2.pdf","comment":"We identified a minor typographical error in Equation 6; We updated\n the paper accordingly. Project Webpage:\n https://imagine.enpc.fr/~guedona/sugar/"},{"id":"http://arxiv.org/abs/2311.17846v1","updated":"2023-11-29T17:49:33Z","published":"2023-11-29T17:49:33Z","title":"Towards Real-World Focus Stacking with Deep Learning","summary":" Focus stacking is widely used in micro, macro, and landscape photography to\nreconstruct all-in-focus images from multiple frames obtained with focus\nbracketing, that is, with shallow depth of field and different focus planes.\nExisting deep learning approaches to the underlying multi-focus image fusion\nproblem have limited applicability to real-world imagery since they are\ndesigned for very short image sequences (two to four images), and are typically\ntrained on small, low-resolution datasets either acquired by light-field\ncameras or generated synthetically. We introduce a new dataset consisting of 94\nhigh-resolution bursts of raw images with focus bracketing, with pseudo ground\ntruth computed from the data using state-of-the-art commercial software. This\ndataset is used to train the first deep learning algorithm for focus stacking\ncapable of handling bursts of sufficient length for real-world applications.\nQualitative experiments demonstrate that it is on par with existing commercial\nsolutions in the long-burst, realistic regime while being significantly more\ntolerant to noise. The code and dataset are available at\nhttps://github.com/araujoalexandre/FocusStackingDataset.\n","authors":["Alexandre Araujo","Jean Ponce","Julien Mairal"],"pdf_url":"https://arxiv.org/pdf/2311.17846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17842v1","updated":"2023-11-29T17:46:25Z","published":"2023-11-29T17:46:25Z","title":"Look Before You Leap: Unveiling the Power of GPT-4V in Robotic\n Vision-Language Planning","summary":" In this study, we are interested in imbuing robots with the capability of\nphysically-grounded task planning. Recent advancements have shown that large\nlanguage models (LLMs) possess extensive knowledge useful in robotic tasks,\nespecially in reasoning and planning. However, LLMs are constrained by their\nlack of world grounding and dependence on external affordance models to\nperceive environmental information, which cannot jointly reason with LLMs. 
We\nargue that a task planner should be an inherently grounded, unified multimodal\nsystem. To this end, we introduce Robotic Vision-Language Planning (ViLa), a\nnovel approach for long-horizon robotic planning that leverages vision-language\nmodels (VLMs) to generate a sequence of actionable steps. ViLa directly\nintegrates perceptual data into its reasoning and planning process, enabling a\nprofound understanding of commonsense knowledge in the visual world, including\nspatial layouts and object attributes. It also supports flexible multimodal\ngoal specification and naturally incorporates visual feedback. Our extensive\nevaluation, conducted in both real-robot and simulated environments,\ndemonstrates ViLa's superiority over existing LLM-based planners, highlighting\nits effectiveness in a wide array of open-world manipulation tasks.\n","authors":["Yingdong Hu","Fanqi Lin","Tong Zhang","Li Yi","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2311.17842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17834v1","updated":"2023-11-29T17:36:49Z","published":"2023-11-29T17:36:49Z","title":"SPiC-E : Structural Priors in 3D Diffusion Models using Cross Entity\n Attention","summary":" We are witnessing rapid progress in automatically generating and manipulating\n3D assets due to the availability of pretrained text-image diffusion models.\nHowever, time-consuming optimization procedures are required for synthesizing\neach sample, hindering their potential for democratizing 3D content creation.\nConversely, 3D diffusion models now train on million-scale 3D datasets,\nyielding high-quality text-conditional 3D samples within seconds. In this work,\nwe present SPiC-E - a neural network that adds structural guidance to 3D\ndiffusion models, extending their usage beyond text-conditional generation. At\nits core, our framework introduces a cross-entity attention mechanism that\nallows for multiple entities (in particular, paired input and guidance 3D\nshapes) to interact via their internal representations within the denoising\nnetwork. We utilize this mechanism for learning task-specific structural priors\nin 3D diffusion models from auxiliary guidance shapes. We show that our\napproach supports a variety of applications, including 3D stylization, semantic\nshape editing and text-conditional abstraction-to-3D, which transforms\nprimitive-based abstractions into highly-expressive shapes. Extensive\nexperiments demonstrate that SPiC-E achieves SOTA performance over these tasks\nwhile often being considerably faster than alternative methods. Importantly,\nthis is accomplished without tailoring our approach for any specific task.\n","authors":["Etai Sella","Gal Fiebelman","Noam Atia","Hadar Averbuch-Elor"],"pdf_url":"https://arxiv.org/pdf/2311.17834v1.pdf","comment":"Project webpage: https://tau-vailab.github.io/spic-e"},{"id":"http://arxiv.org/abs/2311.17833v1","updated":"2023-11-29T17:35:29Z","published":"2023-11-29T17:35:29Z","title":"Analyzing and Explaining Image Classifiers via Diffusion Guidance","summary":" While deep learning has led to huge progress in complex image classification\ntasks like ImageNet, unexpected failure modes, e.g. via spurious features, call\ninto question how reliably these classifiers work in the wild. Furthermore, for\nsafety-critical tasks the black-box nature of their decisions is problematic,\nand explanations or at least methods which make decisions plausible are needed\nurgently. 
In this paper, we address these problems by generating images that\noptimize a classifier-derived objective using a framework for guided image\ngeneration. We analyze the behavior and decisions of image classifiers by\nvisual counterfactual explanations (VCEs), detection of systematic mistakes by\nanalyzing images where classifiers maximally disagree, and visualization of\nneurons to verify potential spurious features. In this way, we validate\nexisting observations, e.g. the shape bias of adversarially robust models, as\nwell as novel failure modes, e.g. systematic errors of zero-shot CLIP\nclassifiers, or identify harmful spurious features. Moreover, our VCEs\noutperform previous work while being more versatile.\n","authors":["Maximilian Augustin","Yannic Neuhaus","Matthias Hein"],"pdf_url":"https://arxiv.org/pdf/2311.17833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17812v1","updated":"2023-11-29T17:03:37Z","published":"2023-11-29T17:03:37Z","title":"DAP: Domain-aware Prompt Learning for Vision-and-Language Navigation","summary":" Following language instructions to navigate in unseen environments is a\nchallenging task for autonomous embodied agents. With strong representation\ncapabilities, pretrained vision-and-language models are widely used in VLN.\nHowever, most of them are trained on web-crawled general-purpose datasets,\nwhich incurs a considerable domain gap when used for VLN tasks. To address the\nproblem, we propose a novel and model-agnostic domain-aware prompt learning\n(DAP) framework. For equipping the pretrained models with specific object-level\nand scene-level cross-modal alignment in VLN tasks, DAP applies a low-cost\nprompt tuning paradigm to learn soft visual prompts for extracting in-domain\nimage semantics. Specifically, we first generate a set of in-domain image-text\npairs with the help of the CLIP model. Then we introduce soft visual prompts in\nthe input space of the visual encoder in a pretrained model. DAP injects\nin-domain visual knowledge into the visual encoder of the pretrained model in\nan efficient way. Experimental results on both R2R and REVERIE show the\nsuperiority of DAP compared to existing state-of-the-art methods.\n","authors":["Ting Liu","Yue Hu","Wansen Wu","Youkai Wang","Kai Xu","Quanjun Yin"],"pdf_url":"https://arxiv.org/pdf/2311.17812v1.pdf","comment":"4 pages. arXiv admin note: substantial text overlap with\n arXiv:2309.03661"},{"id":"http://arxiv.org/abs/2311.07184v2","updated":"2023-11-29T17:01:00Z","published":"2023-11-13T09:19:14Z","title":"Cross-Axis Transformer with 2D Rotary Embeddings","summary":" Despite lagging behind their modal cousins in many respects, Vision\nTransformers have provided an interesting opportunity to bridge the gap between\nsequence modeling and image modeling. Up until now however, vision transformers\nhave largely been held back, due to both computational inefficiency, and lack\nof proper handling of spatial dimensions. In this paper, we introduce the\nCross-Axis Transformer. 
CAT is a model inspired by both Axial Transformers, and\nMicrosoft's recent Retentive Network, that drastically reduces the required\nnumber of floating point operations required to process an image, while\nsimultaneously converging faster and more accurately than the Vision\nTransformers it replaces.\n","authors":["Lily Erickson"],"pdf_url":"https://arxiv.org/pdf/2311.07184v2.pdf","comment":"7 pages, 8 figures"},{"id":"http://arxiv.org/abs/2311.17810v1","updated":"2023-11-29T16:59:45Z","published":"2023-11-29T16:59:45Z","title":"Coloring the Past: Neural Historical Buildings Reconstruction from\n Archival Photography","summary":" Historical buildings are a treasure and milestone of human cultural heritage.\nReconstructing the 3D models of these building hold significant value. The\nrapid development of neural rendering methods makes it possible to recover the\n3D shape only based on archival photographs. However, this task presents\nconsiderable challenges due to the limitations of such datasets. Historical\nphotographs are often limited in number and the scenes in these photos might\nhave altered over time. The radiometric quality of these images is also often\nsub-optimal. To address these challenges, we introduce an approach to\nreconstruct the geometry of historical buildings, employing volumetric\nrendering techniques. We leverage dense point clouds as a geometric prior and\nintroduce a color appearance embedding loss to recover the color of the\nbuilding given limited available color images. We aim for our work to spark\nincreased interest and focus on preserving historical buildings. Thus, we also\nintroduce a new historical dataset of the Hungarian National Theater, providing\na new benchmark for the reconstruction method.\n","authors":["David Komorowicz","Lu Sang","Ferdinand Maiwald","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2311.17810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17804v1","updated":"2023-11-29T16:54:25Z","published":"2023-11-29T16:54:25Z","title":"Aggregation Model Hyperparameters Matter in Digital Pathology","summary":" Digital pathology has significantly advanced disease detection and\npathologist efficiency through the analysis of gigapixel whole-slide images\n(WSI). In this process, WSIs are first divided into patches, for which a\nfeature extractor model is applied to obtain feature vectors, which are\nsubsequently processed by an aggregation model to predict the respective WSI\nlabel. With the rapid evolution of representation learning, numerous new\nfeature extractor models, often termed foundational models, have emerged.\nTraditional evaluation methods, however, rely on fixed aggregation model\nhyperparameters, a framework we identify as potentially biasing the results.\nOur study uncovers a co-dependence between feature extractor models and\naggregation model hyperparameters, indicating that performance comparability\ncan be skewed based on the chosen hyperparameters. By accounting for this\nco-dependency, we find that the performance of many current feature extractor\nmodels is notably similar. We support this insight by evaluating seven feature\nextractor models across three different datasets with 162 different aggregation\nmodel configurations. 
This comprehensive approach provides a more nuanced\nunderstanding of the relationship between feature extractors and aggregation\nmodels, leading to a fairer and more accurate assessment of feature extractor\nmodels in digital pathology.\n","authors":["Gustav Bredell","Marcel Fischer","Przemyslaw Szostak","Samaneh Abbasi-Sureshjani","Alvaro Gomariz"],"pdf_url":"https://arxiv.org/pdf/2311.17804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17791v1","updated":"2023-11-29T16:35:24Z","published":"2023-11-29T16:35:24Z","title":"U-Net v2: Rethinking the Skip Connections of U-Net for Medical Image\n Segmentation","summary":" In this paper, we introduce U-Net v2, a new robust and efficient U-Net\nvariant for medical image segmentation. It aims to augment the infusion of\nsemantic information into low-level features while simultaneously refining\nhigh-level features with finer details. For an input image, we begin by\nextracting multi-level features with a deep neural network encoder. Next, we\nenhance the feature map of each level by infusing semantic information from\nhigher-level features and integrating finer details from lower-level features\nthrough Hadamard product. Our novel skip connections empower features of all\nthe levels with enriched semantic characteristics and intricate details. The\nimproved features are subsequently transmitted to the decoder for further\nprocessing and segmentation. Our method can be seamlessly integrated into any\nEncoder-Decoder network. We evaluate our method on several public medical image\nsegmentation datasets for skin lesion segmentation and polyp segmentation, and\nthe experimental results demonstrate the segmentation accuracy of our new\nmethod over state-of-the-art methods, while preserving memory and computational\nefficiency. Code is available at: https://github.com/yaoppeng/U-Net\\_v2\n","authors":["Yaopeng Peng","Milan Sonka","Danny Z. Chen"],"pdf_url":"https://arxiv.org/pdf/2311.17791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01406v2","updated":"2023-11-29T16:23:33Z","published":"2023-10-02T17:59:17Z","title":"HumanNorm: Learning Normal Diffusion Model for High-quality and\n Realistic 3D Human Generation","summary":" Recent text-to-3D methods employing diffusion models have made significant\nadvancements in 3D human generation. However, these approaches face challenges\ndue to the limitations of text-to-image diffusion models, which lack an\nunderstanding of 3D structures. Consequently, these methods struggle to achieve\nhigh-quality human generation, resulting in smooth geometry and cartoon-like\nappearances. In this paper, we propose HumanNorm, a novel approach for\nhigh-quality and realistic 3D human generation. The main idea is to enhance the\nmodel's 2D perception of 3D geometry by learning a normal-adapted diffusion\nmodel and a normal-aligned diffusion model. The normal-adapted diffusion model\ncan generate high-fidelity normal maps corresponding to user prompts with\nview-dependent and body-aware text. The normal-aligned diffusion model learns\nto generate color images aligned with the normal maps, thereby transforming\nphysical geometry details into realistic appearance. Leveraging the proposed\nnormal diffusion model, we devise a progressive geometry generation strategy\nand a multi-step Score Distillation Sampling (SDS) loss to enhance the\nperformance of 3D human generation. Comprehensive experiments substantiate\nHumanNorm's ability to generate 3D humans with intricate geometry and realistic\nappearances. 
HumanNorm outperforms existing text-to-3D methods in both geometry\nand texture quality. The project page of HumanNorm is\nhttps://humannorm.github.io/.\n","authors":["Xin Huang","Ruizhi Shao","Qi Zhang","Hongwen Zhang","Ying Feng","Yebin Liu","Qing Wang"],"pdf_url":"https://arxiv.org/pdf/2310.01406v2.pdf","comment":"The project page of HumanNorm is https://humannorm.github.io/"},{"id":"http://arxiv.org/abs/2311.17776v1","updated":"2023-11-29T16:23:06Z","published":"2023-11-29T16:23:06Z","title":"One-Shot Open Affordance Learning with Foundation Models","summary":" We introduce One-shot Open Affordance Learning (OOAL), where a model is\ntrained with just one example per base object category, but is expected to\nidentify novel objects and affordances. While vision-language models excel at\nrecognizing novel objects and scenes, they often struggle to understand finer\nlevels of granularity such as affordances. To handle this issue, we conduct a\ncomprehensive analysis of existing foundation models, to explore their inherent\nunderstanding of affordances and assess the potential for data-limited\naffordance learning. We then propose a vision-language framework with simple\nand effective designs that boost the alignment between visual features and\naffordance text embeddings. Experiments on two affordance segmentation\nbenchmarks show that the proposed method outperforms state-of-the-art models\nwith less than 1% of the full training data, and exhibits reasonable\ngeneralization capability on unseen objects and affordances.\n","authors":["Gen Li","Deqing Sun","Laura Sevilla-Lara","Varun Jampani"],"pdf_url":"https://arxiv.org/pdf/2311.17776v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17770v1","updated":"2023-11-29T16:11:33Z","published":"2023-11-29T16:11:33Z","title":"PillarNeSt: Embracing Backbone Scaling and Pretraining for Pillar-based\n 3D Object Detection","summary":" This paper shows the effectiveness of 2D backbone scaling and pretraining for\npillar-based 3D object detectors. Pillar-based methods mainly employ randomly\ninitialized 2D convolution neural network (ConvNet) for feature extraction and\nfail to enjoy the benefits from the backbone scaling and pretraining in the\nimage domain. To show the scaling-up capacity in point clouds, we introduce the\ndense ConvNet pretrained on large-scale image datasets (e.g., ImageNet) as the\n2D backbone of pillar-based detectors. The ConvNets are adaptively designed\nbased on the model size according to the specific features of point clouds,\nsuch as sparsity and irregularity. Equipped with the pretrained ConvNets, our\nproposed pillar-based detector, termed PillarNeSt, outperforms the existing 3D\nobject detectors by a large margin on the nuScenes and Argoversev2 datasets.\nOur code shall be released upon acceptance.\n","authors":["Weixin Mao","Tiancai Wang","Diankun Zhang","Junjie Yan","Osamu Yoshie"],"pdf_url":"https://arxiv.org/pdf/2311.17770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17754v1","updated":"2023-11-29T15:56:58Z","published":"2023-11-29T15:56:58Z","title":"Cinematic Behavior Transfer via NeRF-based Differentiable Filming","summary":" In the evolving landscape of digital media and video production, the precise\nmanipulation and reproduction of visual elements like camera movements and\ncharacter actions are highly desired. Existing SLAM methods face limitations in\ndynamic scenes and human pose estimation often focuses on 2D projections,\nneglecting 3D statuses. 
To address these issues, we first introduce a reverse\nfilming behavior estimation technique. It optimizes camera trajectories by\nleveraging NeRF as a differentiable renderer and refining SMPL tracks. We then\nintroduce a cinematic transfer pipeline that is able to transfer various shot\ntypes to a new 2D video or a 3D virtual environment. The incorporation of 3D\nengine workflow enables superior rendering and control abilities, which also\nachieves a higher rating in the user study.\n","authors":["Xuekun Jiang","Anyi Rao","Jingbo Wang","Dahua Lin","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2311.17754v1.pdf","comment":"Project Page:\n https://virtualfilmstudio.github.io/projects/cinetransfer"},{"id":"http://arxiv.org/abs/2311.16854v2","updated":"2023-11-29T15:56:38Z","published":"2023-11-28T15:03:53Z","title":"A Unified Approach for Text- and Image-guided 4D Scene Generation","summary":" Large-scale diffusion generative models are greatly simplifying image, video\nand 3D asset creation from user-provided text prompts and images. However, the\nchallenging problem of text-to-4D dynamic 3D scene generation with diffusion\nguidance remains largely unexplored. We propose Dream-in-4D, which features a\nnovel two-stage approach for text-to-4D synthesis, leveraging (1) 3D and 2D\ndiffusion guidance to effectively learn a high-quality static 3D asset in the\nfirst stage; (2) a deformable neural radiance field that explicitly\ndisentangles the learned static asset from its deformation, preserving quality\nduring motion learning; and (3) a multi-resolution feature grid for the\ndeformation field with a displacement total variation loss to effectively learn\nmotion with video diffusion guidance in the second stage. Through a user\npreference study, we demonstrate that our approach significantly advances image\nand motion quality, 3D consistency and text fidelity for text-to-4D generation\ncompared to baseline approaches. Thanks to its motion-disentangled\nrepresentation, Dream-in-4D can also be easily adapted for controllable\ngeneration where appearance is defined by one or multiple images, without the\nneed to modify the motion learning stage. Thus, our method offers, for the\nfirst time, a unified approach for text-to-4D, image-to-4D and personalized 4D\ngeneration tasks.\n","authors":["Yufeng Zheng","Xueting Li","Koki Nagano","Sifei Liu","Karsten Kreis","Otmar Hilliges","Shalini De Mello"],"pdf_url":"https://arxiv.org/pdf/2311.16854v2.pdf","comment":"Project page: https://research.nvidia.com/labs/nxp/dream-in-4d/"},{"id":"http://arxiv.org/abs/2311.17752v1","updated":"2023-11-29T15:56:31Z","published":"2023-11-29T15:56:31Z","title":"BAND-2k: Banding Artifact Noticeable Database for Banding Detection and\n Quality Assessment","summary":" Banding, also known as staircase-like contours, frequently occurs in flat\nareas of images/videos processed by the compression or quantization algorithms.\nAs undesirable artifacts, banding destroys the original image structure, thus\ndegrading users' quality of experience (QoE). In this paper, we systematically\ninvestigate the banding image quality assessment (IQA) problem, aiming to\ndetect the image banding artifacts and evaluate their perceptual visual\nquality. Considering that the existing image banding databases only contain\nlimited content sources and banding generation methods, and lack perceptual\nquality labels (i.e. 
mean opinion scores), we first build the largest banding\nIQA database so far, named Banding Artifact Noticeable Database (BAND-2k),\nwhich consists of 2,000 banding images generated by 15 compression and\nquantization schemes. A total of 23 workers participated in the subjective IQA\nexperiment, yielding over 214,000 patch-level banding class labels and 44,371\nreliable image-level quality ratings. Subsequently, we develop an effective\nno-reference (NR) banding evaluator for banding detection and quality\nassessment by leveraging frequency characteristics of banding artifacts. A dual\nconvolutional neural network is employed to concurrently learn the feature\nrepresentation from the high-frequency and low-frequency maps, thereby\nenhancing the ability to discern banding artifacts. The quality score of a\nbanding image is generated by pooling the banding detection maps masked by the\nspatial frequency filters. Experiments demonstrate that our banding evaluator\nachieves a remarkably high accuracy in banding detection and also exhibits high\nSRCC and PLCC results with the perceptual quality labels. These findings unveil\nthe strong correlations between the intensity of banding artifacts and the\nperceptual visual quality, thus validating the necessity of banding quality\nassessment.\n","authors":["Zijian Chen","Wei Sun","Jun Jia","Fangfang Lu","Zicheng Zhang","Jing Liu","Ru Huang","Xiongkuo Min","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2311.17752v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17041v2","updated":"2023-11-29T15:52:55Z","published":"2023-11-28T18:53:06Z","title":"Efficient In-Context Learning in Vision-Language Models for Egocentric\n Videos","summary":" Recent advancements in text-only large language models (LLMs) have\nhighlighted the benefit of in-context learning for adapting to new tasks with a\nfew demonstrations. However, extending in-context learning to large\nvision-language models (VLMs) using a huge amount of naturalistic\nvision-language data has shown limited success, particularly for egocentric\nvideos, due to high data collection costs. We propose a novel training method\n$\\mathbb{E}$fficient $\\mathbb{I}$n-context $\\mathbb{L}$earning on\n$\\mathbb{E}$gocentric $\\mathbb{V}$ideos ($\\mathbb{EILEV}$), which elicits\nin-context learning in VLMs for egocentric videos without requiring massive,\nnaturalistic egocentric video datasets. $\\mathbb{EILEV}$ involves architectural\nand training data adaptations to allow the model to process contexts\ninterleaved with video clips and narrations, sampling of in-context examples\nwith clusters of similar verbs and nouns, use of data with skewed marginal\ndistributions with a long tail of infrequent verbs and nouns, as well as\nhomonyms and synonyms. Our evaluations show that $\\mathbb{EILEV}$-trained\nmodels outperform larger VLMs trained on a huge amount of naturalistic data in\nin-context learning. Furthermore, they can generalize to not only\nout-of-distribution, but also novel, rare egocentric videos and texts via\nin-context learning, demonstrating potential for applications requiring\ncost-effective training, and rapid post-deployment adaptability. 
Our code and\ndemo are available at \\url{https://github.com/yukw777/EILEV}.\n","authors":["Keunwoo Peter Yu","Zheyuan Zhang","Fengyuan Hu","Joyce Chai"],"pdf_url":"https://arxiv.org/pdf/2311.17041v2.pdf","comment":"10 pages, LaTeX; added acknowledgments"},{"id":"http://arxiv.org/abs/2311.17744v1","updated":"2023-11-29T15:49:31Z","published":"2023-11-29T15:49:31Z","title":"Variational Bayes image restoration with compressive autoencoders","summary":" Regularization of inverse problems is of paramount importance in\ncomputational imaging. The ability of neural networks to learn efficient image\nrepresentations has been recently exploited to design powerful data-driven\nregularizers. While state-of-the-art plug-and-play methods rely on an implicit\nregularization provided by neural denoisers, alternative Bayesian approaches\nconsider Maximum A Posteriori (MAP) estimation in the latent space of a\ngenerative model, thus with an explicit regularization. However,\nstate-of-the-art deep generative models require a huge amount of training data\ncompared to denoisers. Besides, their complexity hampers the optimization of\nthe latent MAP. In this work, we propose to use compressive autoencoders for\nlatent estimation. These networks, which can be seen as variational\nautoencoders with a flexible latent prior, are smaller and easier to train than\nstate-of-the-art generative models. We then introduce the Variational Bayes\nLatent Estimation (VBLE) algorithm, which performs this estimation within the\nframework of variational inference. This allows for fast and easy (approximate)\nposterior sampling. Experimental results on image datasets BSD and FFHQ\ndemonstrate that VBLE reaches similar performance than state-of-the-art\nplug-and-play methods, while being able to quantify uncertainties faster than\nother existing posterior sampling techniques.\n","authors":["Maud Biquard","Marie Chabert","Thomas Oberlin"],"pdf_url":"https://arxiv.org/pdf/2311.17744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17737v1","updated":"2023-11-29T15:40:11Z","published":"2023-11-29T15:40:11Z","title":"GenZI: Zero-Shot 3D Human-Scene Interaction Generation","summary":" Can we synthesize 3D humans interacting with scenes without learning from any\n3D human-scene interaction data? We propose GenZI, the first zero-shot approach\nto generating 3D human-scene interactions. Key to GenZI is our distillation of\ninteraction priors from large vision-language models (VLMs), which have learned\na rich semantic space of 2D human-scene compositions. Given a natural language\ndescription and a coarse point location of the desired interaction in a 3D\nscene, we first leverage VLMs to imagine plausible 2D human interactions\ninpainted into multiple rendered views of the scene. We then formulate a robust\niterative optimization to synthesize the pose and shape of a 3D human model in\nthe scene, guided by consistency with the 2D interaction hypotheses. 
In\ncontrast to existing learning-based approaches, GenZI circumvents the\nconventional need for captured 3D interaction data, and allows for flexible\ncontrol of the 3D interaction synthesis with easy-to-use text prompts.\nExtensive experiments show that our zero-shot approach has high flexibility and\ngenerality, making it applicable to diverse scene types, including both indoor\nand outdoor environments.\n","authors":["Lei Li","Angela Dai"],"pdf_url":"https://arxiv.org/pdf/2311.17737v1.pdf","comment":"Project page: https://craigleili.github.io/projects/genzi/ Video:\n https://youtu.be/ozfs6E0JIMY"},{"id":"http://arxiv.org/abs/2311.07574v2","updated":"2023-11-29T15:37:24Z","published":"2023-11-13T18:59:31Z","title":"To See is to Believe: Prompting GPT-4V for Better Visual Instruction\n Tuning","summary":" Existing visual instruction tuning methods typically prompt large language\nmodels with textual descriptions to generate instruction-following data.\nDespite the promising performance achieved, these descriptions are derived from\nimage annotations, which are oftentimes coarse-grained. Furthermore, the\ninstructions might even contradict the visual content without observing the\nentire visual context. To address this challenge, we introduce a fine-grained\nvisual instruction dataset, LVIS-Instruct4V, which contains 220K visually\naligned and context-aware instructions produced by prompting the powerful\nGPT-4V with images from LVIS. Through experimental validation and case studies,\nwe demonstrate that high-quality visual instructional data could improve the\nperformance of LLaVA-1.5, a state-of-the-art large multimodal model, across a\nwide spectrum of benchmarks by clear margins. Notably, by simply replacing the\nLLaVA-Instruct with our LVIS-Instruct4V, we achieve better results than LLaVA\non most challenging LMM benchmarks, e.g., LLaVA$^w$ (76.7 vs. 70.7) and MM-Vet\n(40.2 vs. 35.4). We release our data and model at\nhttps://github.com/X2FD/LVIS-INSTRUCT4V.\n","authors":["Junke Wang","Lingchen Meng","Zejia Weng","Bo He","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.07574v2.pdf","comment":"techical report; work in progress"},{"id":"http://arxiv.org/abs/2311.13562v2","updated":"2023-11-29T15:24:35Z","published":"2023-11-22T18:15:43Z","title":"Soulstyler: Using Large Language Model to Guide Image Style Transfer for\n Target Object","summary":" Image style transfer occupies an important place in both computer graphics\nand computer vision. However, most current methods require reference to\nstylized images and cannot individually stylize specific objects. To overcome\nthis limitation, we propose the \"Soulstyler\" framework, which allows users to\nguide the stylization of specific objects in an image through simple textual\ndescriptions. We introduce a large language model to parse the text and\nidentify stylization goals and specific styles. Combined with a CLIP-based\nsemantic visual embedding encoder, the model understands and matches text and\nimage content. We also introduce a novel localized text-image block matching\nloss that ensures that style transfer is performed only on specified target\nobjects, while non-target regions remain in their original style. Experimental\nresults demonstrate that our model is able to accurately perform style transfer\non target objects according to textual descriptions without affecting the style\nof background regions. 
Our code will be available at\nhttps://github.com/yisuanwang/Soulstyler.\n","authors":["Junhao Chen","Peng Rong","Jingbo Sun","Chao Li","Xiang Li","Hongwu Lv"],"pdf_url":"https://arxiv.org/pdf/2311.13562v2.pdf","comment":"5 pages,3 figures,ICASSP2024"},{"id":"http://arxiv.org/abs/2311.00213v2","updated":"2023-11-29T15:21:58Z","published":"2023-11-01T01:20:12Z","title":"Consistent Video-to-Video Transfer Using Synthetic Dataset","summary":" We introduce a novel and efficient approach for text-based video-to-video\nediting that eliminates the need for resource-intensive per-video-per-model\nfinetuning. At the core of our approach is a synthetic paired video dataset\ntailored for video-to-video transfer tasks. Inspired by Instruct Pix2Pix's\nimage transfer via editing instruction, we adapt this paradigm to the video\ndomain. Extending the Prompt-to-Prompt to videos, we efficiently generate\npaired samples, each with an input video and its edited counterpart. Alongside\nthis, we introduce the Long Video Sampling Correction during sampling, ensuring\nconsistent long videos across batches. Our method surpasses current methods\nlike Tune-A-Video, heralding substantial progress in text-based video-to-video\nediting and suggesting exciting avenues for further exploration and deployment.\n","authors":["Jiaxin Cheng","Tianjun Xiao","Tong He"],"pdf_url":"https://arxiv.org/pdf/2311.00213v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17717v1","updated":"2023-11-29T15:19:49Z","published":"2023-11-29T15:19:49Z","title":"Receler: Reliable Concept Erasing of Text-to-Image Diffusion Models via\n Lightweight Erasers","summary":" Concept erasure in text-to-image diffusion models aims to disable pre-trained\ndiffusion models from generating images related to a target concept. To perform\nreliable concept erasure, the properties of robustness and locality are\ndesirable. The former refrains the model from producing images associated with\nthe target concept for any paraphrased or learned prompts, while the latter\npreserves the model ability in generating images for non-target concepts. In\nthis paper, we propose Reliable Concept Erasing via Lightweight Erasers\n(Receler), which learns a lightweight Eraser to perform concept erasing and\nenhances locality and robustness with the proposed concept-localized\nregularization and adversarial prompt learning, respectively. Comprehensive\nquantitative and qualitative experiments with various concept prompts verify\nthe superiority of Receler over the previous erasing methods on the above two\ndesirable properties.\n","authors":["Chi-Pin Huang","Kai-Po Chang","Chung-Ting Tsai","Yung-Hsuan Lai","Yu-Chiang Frank Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12754v2","updated":"2023-11-29T15:19:38Z","published":"2023-11-21T17:59:14Z","title":"SelfOcc: Self-Supervised Vision-Based 3D Occupancy Prediction","summary":" 3D occupancy prediction is an important task for the robustness of\nvision-centric autonomous driving, which aims to predict whether each point is\noccupied in the surrounding 3D space. Existing methods usually require 3D\noccupancy labels to produce meaningful results. However, it is very laborious\nto annotate the occupancy status of each voxel. In this paper, we propose\nSelfOcc to explore a self-supervised way to learn 3D occupancy using only video\nsequences. We first transform the images into the 3D space (e.g., bird's eye\nview) to obtain 3D representation of the scene. 
We directly impose constraints\non the 3D representations by treating them as signed distance fields. We can\nthen render 2D images of previous and future frames as self-supervision signals\nto learn the 3D representations. We propose an MVS-embedded strategy to\ndirectly optimize the SDF-induced weights with multiple depth proposals. Our\nSelfOcc outperforms the previous best method SceneRF by 58.7% using a single\nframe as input on SemanticKITTI and is the first self-supervised work that\nproduces reasonable 3D occupancy for surround cameras on nuScenes. SelfOcc\nproduces high-quality depth and achieves state-of-the-art results on novel\ndepth synthesis, monocular depth estimation, and surround-view depth estimation\non the SemanticKITTI, KITTI-2015, and nuScenes, respectively. Code:\nhttps://github.com/huang-yh/SelfOcc.\n","authors":["Yuanhui Huang","Wenzhao Zheng","Borui Zhang","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2311.12754v2.pdf","comment":"Code is available at: https://github.com/huang-yh/SelfOcc"},{"id":"http://arxiv.org/abs/2311.17707v1","updated":"2023-11-29T15:11:03Z","published":"2023-11-29T15:11:03Z","title":"SAMPro3D: Locating SAM Prompts in 3D for Zero-Shot Scene Segmentation","summary":" We introduce SAMPro3D for zero-shot 3D indoor scene segmentation. Given the\n3D point cloud and multiple posed 2D frames of 3D scenes, our approach segments\n3D scenes by applying the pretrained Segment Anything Model (SAM) to 2D frames.\nOur key idea involves locating 3D points in scenes as natural 3D prompts to\nalign their projected pixel prompts across frames, ensuring frame-consistency\nin both pixel prompts and their SAM-predicted masks. Moreover, we suggest\nfiltering out low-quality 3D prompts based on feedback from all 2D frames, for\nenhancing segmentation quality. We also propose to consolidate different 3D\nprompts if they are segmenting the same object, bringing a more comprehensive\nsegmentation. Notably, our method does not require any additional training on\ndomain-specific data, enabling us to preserve the zero-shot power of SAM.\nExtensive qualitative and quantitative results show that our method\nconsistently achieves higher quality and more diverse segmentation than\nprevious zero-shot or fully supervised approaches, and in many cases even\nsurpasses human-level annotations. The project page can be accessed at\nhttps://mutianxu.github.io/sampro3d/.\n","authors":["Mutian Xu","Xingyilang Yin","Lingteng Qiu","Yang Liu","Xin Tong","Xiaoguang Han"],"pdf_url":"https://arxiv.org/pdf/2311.17707v1.pdf","comment":"Project page: https://mutianxu.github.io/sampro3d/"},{"id":"http://arxiv.org/abs/2311.17695v1","updated":"2023-11-29T15:02:01Z","published":"2023-11-29T15:02:01Z","title":"Fair Text-to-Image Diffusion via Fair Mapping","summary":" In this paper, we address the limitations of existing text-to-image diffusion\nmodels in generating demographically fair results when given human-related\ndescriptions. These models often struggle to disentangle the target language\ncontext from sociocultural biases, resulting in biased image generation. To\novercome this challenge, we propose Fair Mapping, a general, model-agnostic,\nand lightweight approach that modifies a pre-trained text-to-image model by\ncontrolling the prompt to achieve fair image generation. One key advantage of\nour approach is its high efficiency. 
The training process only requires\nupdating a small number of parameters in an additional linear mapping network.\nThis not only reduces the computational cost but also accelerates the\noptimization process. We first demonstrate the issue of bias in generated\nresults caused by language biases in text-guided diffusion models. By\ndeveloping a mapping network that projects language embeddings into an unbiased\nspace, we enable the generation of relatively balanced demographic results\nbased on a keyword specified in the prompt. With comprehensive experiments on\nface image generation, we show that our method significantly improves image\ngeneration performance when prompted with descriptions related to human faces.\nBy effectively addressing the issue of bias, we produce more fair and diverse\nimage outputs. This work contributes to the field of text-to-image generation\nby enhancing the ability to generate images that accurately reflect the\nintended demographic characteristics specified in the text.\n","authors":["Jia Li","Lijie Hu","Jingfeng Zhang","Tianhang Zheng","Hua Zhang","Di Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17693v1","updated":"2023-11-29T15:00:06Z","published":"2023-11-29T15:00:06Z","title":"Toward a Surgeon-in-the-Loop Ophthalmic Robotic Apprentice using\n Reinforcement and Imitation Learning","summary":" Robotic-assisted surgical systems have demonstrated significant potential in\nenhancing surgical precision and minimizing human errors. However, existing\nsystems lack the ability to accommodate the unique preferences and requirements\nof individual surgeons. Additionally, they primarily focus on general surgeries\n(e.g., laparoscopy) and are not suitable for highly precise microsurgeries,\nsuch as ophthalmic procedures. Thus, we propose a simulation-based image-guided\napproach for surgeon-centered autonomous agents that can adapt to the\nindividual surgeon's skill level and preferred surgical techniques during\nophthalmic cataract surgery. Our approach utilizes a simulated environment to\ntrain reinforcement and imitation learning agents guided by image data to\nperform all tasks of the incision phase of cataract surgery. By integrating the\nsurgeon's actions and preferences into the training process with the\nsurgeon-in-the-loop, our approach enables the robot to implicitly learn and\nadapt to the individual surgeon's unique approach through demonstrations. This\nresults in a more intuitive and personalized surgical experience for the\nsurgeon. Simultaneously, it ensures consistent performance for the autonomous\nrobotic apprentice. We define and evaluate the effectiveness of our approach\nusing our proposed metrics; and highlight the trade-off between a generic agent\nand a surgeon-centered adapted agent. Moreover, our approach has the potential\nto extend to other ophthalmic surgical procedures, opening the door to a new\ngeneration of surgeon-in-the-loop autonomous surgical robots. 
We provide an\nopen-source simulation framework for future development and reproducibility.\n","authors":["Amr Gomaa","Bilal Mahdy","Niko Kleer","Antonio Krüger"],"pdf_url":"https://arxiv.org/pdf/2311.17693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17677v1","updated":"2023-11-29T14:40:31Z","published":"2023-11-29T14:40:31Z","title":"COVIDx CXR-4: An Expanded Multi-Institutional Open-Source Benchmark\n Dataset for Chest X-ray Image-Based Computer-Aided COVID-19 Diagnostics","summary":" The global ramifications of the COVID-19 pandemic remain significant,\nexerting persistent pressure on nations even three years after its initial\noutbreak. Deep learning models have shown promise in improving COVID-19\ndiagnostics but require diverse and larger-scale datasets to improve\nperformance. In this paper, we introduce COVIDx CXR-4, an expanded\nmulti-institutional open-source benchmark dataset for chest X-ray image-based\ncomputer-aided COVID-19 diagnostics. COVIDx CXR-4 expands significantly on the\nprevious COVIDx CXR-3 dataset by increasing the total patient cohort size by\ngreater than 2.66 times, resulting in 84,818 images from 45,342 patients across\nmultiple institutions. We provide extensive analysis on the diversity of the\npatient demographic, imaging metadata, and disease distributions to highlight\npotential dataset biases. To the best of the authors' knowledge, COVIDx CXR-4\nis the largest and most diverse open-source COVID-19 CXR dataset and is made\npublicly available as part of an open initiative to advance research to aid\nclinicians against the COVID-19 disease.\n","authors":["Yifan Wu","Hayden Gunraj","Chi-en Amy Tai","Alexander Wong"],"pdf_url":"https://arxiv.org/pdf/2311.17677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.17462v2","updated":"2023-11-29T14:33:28Z","published":"2023-10-26T15:10:10Z","title":"Towards Learning Monocular 3D Object Localization From 2D Labels using\n the Physical Laws of Motion","summary":" We present a novel method for precise 3D object localization in single images\nfrom a single calibrated camera using only 2D labels. No expensive 3D labels\nare needed. Thus, instead of using 3D labels, our model is trained with\neasy-to-annotate 2D labels along with the physical knowledge of the object's\nmotion. Given this information, the model can infer the latent third dimension,\neven though it has never seen this information during training. Our method is\nevaluated on both synthetic and real-world datasets, and we are able to achieve\na mean distance error of just 6 cm in our experiments on real data. The results\nindicate the method's potential as a step towards learning 3D object location\nestimation, where collecting 3D data for training is not feasible.\n","authors":["Daniel Kienzle","Julian Lorenz","Katja Ludwig","Rainer Lienhart"],"pdf_url":"https://arxiv.org/pdf/2310.17462v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19583v2","updated":"2023-11-29T14:33:09Z","published":"2023-10-30T14:41:53Z","title":"GC-MVSNet: Multi-View, Multi-Scale, Geometrically-Consistent Multi-View\n Stereo","summary":" Traditional multi-view stereo (MVS) methods rely heavily on photometric and\ngeometric consistency constraints, but newer machine learning-based MVS methods\ncheck geometric consistency across multiple source views only as a\npost-processing step. 
In this paper, we present a novel approach that\nexplicitly encourages geometric consistency of reference view depth maps across\nmultiple source views at different scales during learning (see Fig. 1). We find\nthat adding this geometric consistency loss significantly accelerates learning\nby explicitly penalizing geometrically inconsistent pixels, reducing the\ntraining iteration requirements to nearly half that of other MVS methods. Our\nextensive experiments show that our approach achieves a new state-of-the-art on\nthe DTU and BlendedMVS datasets, and competitive results on the Tanks and\nTemples benchmark. To the best of our knowledge, GC-MVSNet is the first attempt\nto enforce multi-view, multi-scale geometric consistency during learning.\n","authors":["Vibhas K. Vats","Sripad Joshi","David J. Crandall","Md. Alimoor Reza","Soon-heung Jung"],"pdf_url":"https://arxiv.org/pdf/2310.19583v2.pdf","comment":"Accepted in WACV 2024"},{"id":"http://arxiv.org/abs/2311.17663v1","updated":"2023-11-29T14:25:46Z","published":"2023-11-29T14:25:46Z","title":"Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in\n Autonomous Driving Applications","summary":" Understanding how the surrounding environment changes is crucial for\nperforming downstream tasks safely and reliably in autonomous driving\napplications. Recent occupancy estimation techniques using only camera images\nas input can provide dense occupancy representations of large-scale scenes\nbased on the current observation. However, they are mostly limited to\nrepresenting the current 3D space and do not consider the future state of\nsurrounding objects along the time axis. To extend camera-only occupancy\nestimation into spatiotemporal prediction, we propose Cam4DOcc, a new benchmark\nfor camera-only 4D occupancy forecasting, evaluating the surrounding scene\nchanges in a near future. We build our benchmark based on multiple publicly\navailable datasets, including nuScenes, nuScenes-Occupancy, and Lyft-Level5,\nwhich provides sequential occupancy states of general movable and static\nobjects, as well as their 3D backward centripetal flow. To establish this\nbenchmark for future research with comprehensive comparisons, we introduce four\nbaseline types from diverse camera-based perception and prediction\nimplementations, including a static-world occupancy model, voxelization of\npoint cloud prediction, 2D-3D instance-based prediction, and our proposed novel\nend-to-end 4D occupancy forecasting network. 
Furthermore, the standardized\nevaluation protocol for preset multiple tasks is also provided to compare the\nperformance of all the proposed baselines on present and future occupancy\nestimation with respect to objects of interest in autonomous driving scenarios.\nThe dataset and our implementation of all four baselines in the proposed\nCam4DOcc benchmark will be released here: https://github.com/haomo-ai/Cam4DOcc.\n","authors":["Junyi Ma","Xieyuanli Chen","Jiawei Huang","Jingyi Xu","Zhen Luo","Jintao Xu","Weihao Gu","Rui Ai","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17663v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05062v2","updated":"2023-11-29T14:23:09Z","published":"2023-05-08T21:38:42Z","title":"A Feasibility Study on Indoor Localization and Multi-person Tracking\n Using Sparsely Distributed Camera Network with Edge Computing","summary":" Camera-based activity monitoring systems are becoming an attractive solution\nfor smart building applications with the advances in computer vision and edge\ncomputing technologies. In this paper, we present a feasibility study and\nsystematic analysis of a camera-based indoor localization and multi-person\ntracking system implemented on edge computing devices within a large indoor\nspace. To this end, we deployed an end-to-end edge computing pipeline that\nutilizes multiple cameras to achieve localization, body orientation estimation\nand tracking of multiple individuals within a large therapeutic space spanning\n$1700m^2$, all while maintaining a strong focus on preserving privacy. Our\npipeline consists of 39 edge computing camera systems equipped with Tensor\nProcessing Units (TPUs) placed in the indoor space's ceiling. To ensure the\nprivacy of individuals, a real-time multi-person pose estimation algorithm runs\non the TPU of the computing camera system. This algorithm extracts poses and\nbounding boxes, which are utilized for indoor localization, body orientation\nestimation, and multi-person tracking. Our pipeline demonstrated an average\nlocalization error of 1.41 meters, a multiple-object tracking accuracy score of\n88.6\\%, and a mean absolute body orientation error of 29\\degree. These results\nshow that localization and tracking of individuals in a large indoor space is\nfeasible even with the privacy constraints.\n","authors":["Hyeokhyen Kwon","Chaitra Hegde","Yashar Kiarashi","Venkata Siva Krishna Madala","Ratan Singh","ArjunSinh Nakum","Robert Tweedy","Leandro Miletto Tonetto","Craig M. Zimring","Matthew Doiron","Amy D. Rodriguez","Allan I. Levey","Gari D. Clifford"],"pdf_url":"https://arxiv.org/pdf/2305.05062v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17657v1","updated":"2023-11-29T14:19:40Z","published":"2023-11-29T14:19:40Z","title":"Volumetric Cloud Field Reconstruction","summary":" Volumetric phenomena, such as clouds and fog, present a significant challenge\nfor 3D reconstruction systems due to their translucent nature and their complex\ninteractions with light. Conventional techniques for reconstructing scattering\nvolumes rely on controlled setups, limiting practical applications. This paper\nintroduces an approach to reconstructing volumes from a few input stereo pairs.\nWe propose a novel deep learning framework that integrates a deep stereo model\nwith a 3D Convolutional Neural Network (3D CNN) and an advection module,\ncapable of capturing the shape and dynamics of volumes. 
The stereo depths are\nused to carve empty space around volumes, providing the 3D CNN with a prior for\ncoping with the lack of input views. Refining our output, the advection module\nleverages the temporal evolution of the medium, providing a mechanism to infer\nmotion and improve temporal consistency. The efficacy of our system is\ndemonstrated through its ability to estimate density and velocity fields of\nlarge-scale volumes, in this case, clouds, from a sparse set of stereo image\npairs.\n","authors":["Jacob Lin","Miguel Farinha","Edward Gryspeerdt","Ronald Clark"],"pdf_url":"https://arxiv.org/pdf/2311.17657v1.pdf","comment":"Project page at https://cloud-field.github.io"},{"id":"http://arxiv.org/abs/2311.17656v1","updated":"2023-11-29T14:19:14Z","published":"2023-11-29T14:19:14Z","title":"Multiple Toddler Tracking in Indoor Videos","summary":" Multiple toddler tracking (MTT) involves identifying and differentiating\ntoddlers in video footage. While conventional multi-object tracking (MOT)\nalgorithms are adept at tracking diverse objects, toddlers pose unique\nchallenges due to their unpredictable movements, various poses, and similar\nappearance. Tracking toddlers in indoor environments introduces additional\ncomplexities such as occlusions and limited fields of view. In this paper, we\naddress the challenges of MTT and propose MTTSort, a customized method built\nupon the DeepSort algorithm. MTTSort is designed to track multiple toddlers in\nindoor videos accurately. Our contributions include discussing the primary\nchallenges in MTT, introducing a genetic algorithm to optimize hyperparameters,\nproposing an accurate tracking algorithm, and curating the MTTrack dataset\nusing unbiased AI co-labeling techniques. We quantitatively compare MTTSort to\nstate-of-the-art MOT methods on MTTrack, DanceTrack, and MOT15 datasets. In our\nevaluation, the proposed method outperformed other MOT methods, achieving 0.98,\n0.68, and 0.98 in multiple object tracking accuracy (MOTA), higher order\ntracking accuracy (HOTA), and iterative and discriminative framework 1 (IDF1)\nmetrics, respectively.\n","authors":["Somaieh Amraee","Bishoy Galoaa","Matthew Goodwin","Elaheh Hatamimajoumerd","Sarah Ostadabbas"],"pdf_url":"https://arxiv.org/pdf/2311.17656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17655v1","updated":"2023-11-29T14:18:04Z","published":"2023-11-29T14:18:04Z","title":"Vulnerability of Automatic Identity Recognition to Audio-Visual\n Deepfakes","summary":" The task of deepfakes detection is far from being solved by speech or vision\nresearchers. Several publicly available databases of fake synthetic video and\nspeech were built to aid the development of detection methods. However,\nexisting databases typically focus on visual or voice modalities and provide no\nproof that their deepfakes can in fact impersonate any real person. In this\npaper, we present the first realistic audio-visual database of deepfakes\nSWAN-DF, where lips and speech are well synchronized and video have high visual\nand audio qualities. We took the publicly available SWAN dataset of real videos\nwith different identities to create audio-visual deepfakes using several models\nfrom DeepFaceLab and blending techniques for face swapping and HiFiVC, DiffVC,\nYourTTS, and FreeVC models for voice conversion. From the publicly available\nspeech dataset LibriTTS, we also created a separate database of only audio\ndeepfakes LibriTTS-DF using several latest text to speech methods: YourTTS,\nAdaspeech, and TorToiSe. 
We demonstrate the vulnerability of a state-of-the-art\nspeaker recognition system, such as the ECAPA-TDNN-based model from SpeechBrain, to\nthe synthetic voices. Similarly, we tested a face recognition system based on the\nMobileFaceNet architecture against several variants of our visual deepfakes. The\nvulnerability assessment shows that by tuning the existing pretrained deepfake\nmodels to specific identities, one can successfully spoof the face and speaker\nrecognition systems more than 90% of the time and achieve a very realistic\nlooking and sounding fake video of a given person.\n","authors":["Pavel Korshunov","Haolin Chen","Philip N. Garner","Sebastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2311.17655v1.pdf","comment":"10 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2311.15556v2","updated":"2023-11-29T14:16:08Z","published":"2023-11-27T05:53:03Z","title":"PKU-I2IQA: An Image-to-Image Quality Assessment Database for AI\n Generated Images","summary":" As image generation technology advances, AI-based image generation has been\napplied in various fields and Artificial Intelligence Generated Content (AIGC)\nhas garnered widespread attention. However, the development of AI-based image\ngenerative models also brings new problems and challenges. A significant\nchallenge is that AI-generated images (AIGI) may exhibit unique distortions\ncompared to natural images, and not all generated images meet the requirements\nof the real world. Therefore, it is of great significance to evaluate AIGIs\nmore comprehensively. Although previous work has established several human\nperception-based AIGC image quality assessment (AIGCIQA) databases for\ntext-generated images, the AI image generation technology includes scenarios\nlike text-to-image and image-to-image, and assessing only the images generated\nby text-to-image models is insufficient. To address this issue, we establish a\nhuman perception-based image-to-image AIGCIQA database, named PKU-I2IQA. We\nconduct a well-organized subjective experiment to collect quality labels for\nAIGIs and then conduct a comprehensive analysis of the PKU-I2IQA database.\nFurthermore, we have proposed two benchmark models: NR-AIGCIQA based on the\nno-reference image quality assessment method and FR-AIGCIQA based on the\nfull-reference image quality assessment method. Finally, leveraging this\ndatabase, we conduct benchmark experiments and compare the performance of the\nproposed benchmark models. The PKU-I2IQA database and benchmarks will be\nreleased to facilitate future research on\n\\url{https://github.com/jiquan123/I2IQA}.\n","authors":["Jiquan Yuan","Xinyan Cao","Changjin Li","Fanyi Yang","Jinlong Lin","Xixin Cao"],"pdf_url":"https://arxiv.org/pdf/2311.15556v2.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2111.12971v3","updated":"2023-11-29T14:15:29Z","published":"2021-11-25T09:08:45Z","title":"Natural & Adversarial Bokeh Rendering via Circle-of-Confusion Predictive\n Network","summary":" Bokeh effect is a natural shallow depth-of-field phenomenon that blurs the\nout-of-focus part in photography. In recent years, a series of works have\nproposed automatic and realistic bokeh rendering methods for artistic and\naesthetic purposes. 
They usually employ cutting-edge data-driven deep\ngenerative networks with complex training strategies and network architectures.\nHowever, these works neglect that the bokeh effect, as a real phenomenon, can\ninevitably affect the subsequent visual intelligent tasks like recognition, and\ntheir data-driven nature prevents them from studying the influence of\nbokeh-related physical parameters (i.e., depth-of-the-field) on the intelligent\ntasks. To fill this gap, we study a totally new problem, i.e., natural &\nadversarial bokeh rendering, which consists of two objectives: rendering\nrealistic and natural bokeh and fooling the visual perception models (i.e.,\nbokeh-based adversarial attack). To this end, beyond the pure data-driven\nsolution, we propose a hybrid alternative by taking the respective advantages\nof data-driven and physical-aware methods. Specifically, we propose the\ncircle-of-confusion predictive network (CoCNet) by taking the all-in-focus\nimage and depth image as inputs to estimate circle-of-confusion parameters for\neach pixel, which are employed to render the final image through a well-known\nphysical model of bokeh. With the hybrid solution, our method could achieve\nmore realistic rendering results with the naive training strategy and a much\nlighter network.\n","authors":["Yihao Huang","Felix Juefei-Xu","Qing Guo","Geguang Pu","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2111.12971v3.pdf","comment":"11 pages, accepted by TMM"},{"id":"http://arxiv.org/abs/2311.17647v1","updated":"2023-11-29T14:08:53Z","published":"2023-11-29T14:08:53Z","title":"VIM: Probing Multimodal Large Language Models for Visual Embedded\n Instruction Following","summary":" We introduce VISUAL EMBEDDED INSTRUCTION (VIM), a new framework designed to\nevaluate the visual instruction following capability of Multimodal Large\nLanguage Models (MLLMs). As illustrated in Figure 2, VIM challenges the MLLMs\nby embedding the instructions into the visual scenes, demanding strong visual\ninterpretative skills for instruction following. We adapt VIM to various\nbenchmarks, including VQAv2, MME, MM-Vet, and RefCOCO series, compose a VIM\nbench, and probe diverse MLLMs across three distinct in-context learning\nsettings: Zero Shot, One Shot, and Pair Shot. We observe that there is a\nsignificant performance disparity between the open-source MLLMs and GPT-4V,\nimplying that their proficiency in visual instruction comprehension is not up\nto par. Our results highlight a promising direction for the enhancement of\nMLLMs capabilities on instruction following. We aim VIM to serve as a useful\nnorm for advancing the state of the art and driving further progress in the\nfield.\n","authors":["Yujie Lu","Xiujun Li","William Yang Wang","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2311.17647v1.pdf","comment":"20 pages, 8 figures, 20 tables"},{"id":"http://arxiv.org/abs/2311.17643v1","updated":"2023-11-29T14:01:28Z","published":"2023-11-29T14:01:28Z","title":"Neural Fields with Thermal Activations for Arbitrary-Scale\n Super-Resolution","summary":" Recent approaches for arbitrary-scale single image super-resolution (ASSR)\nhave used local neural fields to represent continuous signals that can be\nsampled at different rates. However, in such formulation, the point-wise query\nof field values does not naturally match the point spread function (PSF) of a\ngiven pixel. 
In this work we present a novel way to design neural fields such\nthat points can be queried with a Gaussian PSF, which serves as anti-aliasing\nwhen moving across resolutions for ASSR. We achieve this using a novel\nactivation function derived from Fourier theory and the heat equation. This\ncomes at no additional cost: querying a point with a Gaussian PSF in our\nframework does not affect computational cost, unlike filtering in the image\ndomain. Coupled with a hypernetwork, our method not only provides theoretically\nguaranteed anti-aliasing, but also sets a new bar for ASSR while also being\nmore parameter-efficient than previous methods.\n","authors":["Alexander Becker","Rodrigo Caye Daudt","Nando Metzger","Jan Dirk Wegner","Konrad Schindler"],"pdf_url":"https://arxiv.org/pdf/2311.17643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.17347v2","updated":"2023-11-29T13:55:37Z","published":"2023-10-26T12:27:56Z","title":"CADS: Unleashing the Diversity of Diffusion Models through\n Condition-Annealed Sampling","summary":" While conditional diffusion models are known to have good coverage of the\ndata distribution, they still face limitations in output diversity,\nparticularly when sampled with a high classifier-free guidance scale for\noptimal image quality or when trained on small datasets. We attribute this\nproblem to the role of the conditioning signal in inference and offer an\nimproved sampling strategy for diffusion models that can increase generation\ndiversity, especially at high guidance scales, with minimal loss of sample\nquality. Our sampling strategy anneals the conditioning signal by adding\nscheduled, monotonically decreasing Gaussian noise to the conditioning vector\nduring inference to balance diversity and condition alignment. Our\nCondition-Annealed Diffusion Sampler (CADS) can be used with any pretrained\nmodel and sampling algorithm, and we show that it boosts the diversity of\ndiffusion models in various conditional generation tasks. Further, using an\nexisting pretrained diffusion model, CADS achieves a new state-of-the-art FID\nof 1.70 and 2.31 for class-conditional ImageNet generation at 256$\\times$256\nand 512$\\times$512 respectively.\n","authors":["Seyedmorteza Sadat","Jakob Buhmann","Derek Bradley","Otmar Hilliges","Romann M. Weber"],"pdf_url":"https://arxiv.org/pdf/2310.17347v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17634v1","updated":"2023-11-29T13:51:12Z","published":"2023-11-29T13:51:12Z","title":"Erasing the Ephemeral: Joint Camera Refinement and Transient Object\n Removal for Street View Synthesis","summary":" Synthesizing novel views for urban environments is crucial for tasks like\nautonomous driving and virtual tours. Compared to object-level or indoor\nsituations, outdoor settings present unique challenges, such as inconsistency\nacross frames due to moving vehicles and camera pose drift over lengthy\nsequences. In this paper, we introduce a method that tackles these challenges\non view synthesis for outdoor scenarios. We employ a neural point light field\nscene representation and strategically detect and mask out dynamic objects to\nreconstruct novel scenes without artifacts. Moreover, we simultaneously\noptimize camera pose along with the view synthesis process, and thus, we\nsimultaneously refine both elements. 
Through validation on real-world urban\ndatasets, we demonstrate state-of-the-art results in synthesizing novel views\nof urban scenes.\n","authors":["Mreenav Shyam Deka","Lu Sang","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2311.17634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17629v1","updated":"2023-11-29T13:43:17Z","published":"2023-11-29T13:43:17Z","title":"Efficient Decoder for End-to-End Oriented Object Detection in Remote\n Sensing Images","summary":" Object instances in remote sensing images often distribute with\nmulti-orientations, varying scales, and dense distribution. These issues bring\nchallenges to end-to-end oriented object detectors including multi-scale\nfeatures alignment and a large number of queries. To address these limitations,\nwe propose an end-to-end oriented detector equipped with an efficient decoder,\nwhich incorporates two technologies, Rotated RoI attention (RRoI attention) and\nSelective Distinct Queries (SDQ). Specifically, RRoI attention effectively\nfocuses on oriented regions of interest through a cross-attention mechanism and\naligns multi-scale features. SDQ collects queries from intermediate decoder\nlayers and then filters similar queries to obtain distinct queries. The\nproposed SDQ can facilitate the optimization of one-to-one label assignment,\nwithout introducing redundant initial queries or extra auxiliary branches.\nExtensive experiments on five datasets demonstrate the effectiveness of our\nmethod. Notably, our method achieves state-of-the-art performance on DIOR-R\n(67.31% mAP), DOTA-v1.5 (67.43% mAP), and DOTA-v2.0 (53.28% mAP) with the\nResNet50 backbone.\n","authors":["Jiaqi Zhao","Zeyu Ding","Yong Zhou","Hancheng Zhu","Wenliang Du","Rui Yao","Abdulmotaleb El Saddik"],"pdf_url":"https://arxiv.org/pdf/2311.17629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17626v1","updated":"2023-11-29T13:39:18Z","published":"2023-11-29T13:39:18Z","title":"Focus on Query: Adversarial Mining Transformer for Few-Shot Segmentation","summary":" Few-shot segmentation (FSS) aims to segment objects of new categories given\nonly a handful of annotated samples. Previous works focus their efforts on\nexploring the support information while paying less attention to the mining of\nthe critical query branch. In this paper, we rethink the importance of support\ninformation and propose a new query-centric FSS model Adversarial Mining\nTransformer (AMFormer), which achieves accurate query image segmentation with\nonly rough support guidance or even weak support labels. The proposed AMFormer\nenjoys several merits. First, we design an object mining transformer (G) that\ncan achieve the expansion of incomplete region activated by support clue, and a\ndetail mining transformer (D) to discriminate the detailed local difference\nbetween the expanded mask and the ground truth. Second, we propose to train G\nand D via an adversarial process, where G is optimized to generate more\naccurate masks approaching ground truth to fool D. We conduct extensive\nexperiments on commonly used Pascal-5i and COCO-20i benchmarks and achieve\nstate-of-the-art results across all settings. In addition, the decent\nperformance with weak support labels in our query-centric paradigm may inspire\nthe development of more general FSS models. 
Code will be available at\nhttps://github.com/Wyxdm/AMNet.\n","authors":["Yuan Wang","Naisong Luo","Tianzhu Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.17626v1.pdf","comment":"Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.17618v1","updated":"2023-11-29T13:26:29Z","published":"2023-11-29T13:26:29Z","title":"ShapeGPT: 3D Shape Generation with A Unified Multi-modal Language Model","summary":" The advent of large language models, enabling flexibility through\ninstruction-driven approaches, has revolutionized many traditional generative\ntasks, but large models for 3D data, particularly in comprehensively handling\n3D shapes with other modalities, are still under-explored. By achieving\ninstruction-based shape generations, versatile multimodal generative shape\nmodels can significantly benefit various fields like 3D virtual construction\nand network-aided design. In this work, we present ShapeGPT, a shape-included\nmulti-modal framework to leverage strong pre-trained language models to address\nmultiple shape-relevant tasks. Specifically, ShapeGPT employs a\nword-sentence-paragraph framework to discretize continuous shapes into shape\nwords, further assembles these words for shape sentences, as well as integrates\nshape with instructional text for multi-modal paragraphs. To learn this\nshape-language model, we use a three-stage training scheme, including shape\nrepresentation, multimodal alignment, and instruction-based generation, to\nalign shape-language codebooks and learn the intricate correlations among these\nmodalities. Extensive experiments demonstrate that ShapeGPT achieves comparable\nperformance across shape-relevant tasks, including text-to-shape,\nshape-to-text, shape completion, and shape editing.\n","authors":["Fukun Yin","Xin Chen","Chi Zhang","Biao Jiang","Zibo Zhao","Jiayuan Fan","Gang Yu","Taihao Li","Tao Chen"],"pdf_url":"https://arxiv.org/pdf/2311.17618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.04671v3","updated":"2023-11-29T13:20:22Z","published":"2022-10-10T13:20:51Z","title":"TCDM: Transformational Complexity Based Distortion Metric for Perceptual\n Point Cloud Quality Assessment","summary":" The goal of objective point cloud quality assessment (PCQA) research is to\ndevelop quantitative metrics that measure point cloud quality in a perceptually\nconsistent manner. Merging the research of cognitive science and intuition of\nthe human visual system (HVS), in this paper, we evaluate the point cloud\nquality by measuring the complexity of transforming the distorted point cloud\nback to its reference, which in practice can be approximated by the code length\nof one point cloud when the other is given. For this purpose, we first make\nspace segmentation for the reference and distorted point clouds based on a 3D\nVoronoi diagram to obtain a series of local patch pairs. Next, inspired by the\npredictive coding theory, we utilize a space-aware vector autoregressive\n(SA-VAR) model to encode the geometry and color channels of each reference\npatch with and without the distorted patch, respectively. Assuming that the\nresidual errors follow the multi-variate Gaussian distributions, the\nself-complexity of the reference and transformational complexity between the\nreference and distorted samples are computed using covariance matrices.\nAdditionally, the prediction terms generated by SA-VAR are introduced as one\nauxiliary feature to promote the final quality prediction. 
The effectiveness of\nthe proposed transformational complexity based distortion metric (TCDM) is\nevaluated through extensive experiments conducted on five public point cloud\nquality assessment databases. The results demonstrate that TCDM achieves\nstate-of-the-art (SOTA) performance, and further analysis confirms its\nrobustness in various scenarios. The code is publicly available at\nhttps://github.com/zyj1318053/TCDM.\n","authors":["Yujie Zhang","Qi Yang","Yifei Zhou","Xiaozhong Xu","Le Yang","Yiling Xu"],"pdf_url":"https://arxiv.org/pdf/2210.04671v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16835v2","updated":"2023-11-29T13:14:58Z","published":"2023-11-28T14:51:08Z","title":"Unified-modal Salient Object Detection via Adaptive Prompt Learning","summary":" Existing single-modal and multi-modal salient object detection (SOD) methods\nfocus on designing specific architectures tailored for their respective tasks.\nHowever, developing completely different models for different tasks leads to\nlabor and time consumption, as well as high computational and practical\ndeployment costs. In this paper, we make the first attempt to address both\nsingle-modal and multi-modal SOD in a unified framework called UniSOD.\nNevertheless, assigning appropriate strategies to modality variable inputs is\nchallenging. To this end, UniSOD learns modality-aware prompts with\ntask-specific hints through adaptive prompt learning, which are plugged into\nthe proposed pre-trained baseline SOD model to handle corresponding tasks,\nwhile only requiring few learnable parameters compared to training the entire\nmodel. Each modality-aware prompt is generated from a switchable prompt\ngeneration block, which performs structural switching solely relied on\nsingle-modal and multi-modal inputs. UniSOD achieves consistent performance\nimprovement on 14 benchmark datasets for RGB, RGB-D, and RGB-T SOD, which\ndemonstrates that our method effectively and efficiently unifies single-modal\nand multi-modal SOD tasks.\n","authors":["Kunpeng Wang","Chenglong Li","Zhengzheng Tu","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2311.16835v2.pdf","comment":"15 pages, 10 figures"},{"id":"http://arxiv.org/abs/2311.17609v1","updated":"2023-11-29T13:06:48Z","published":"2023-11-29T13:06:48Z","title":"AnyLens: A Generative Diffusion Model with Any Rendering Lens","summary":" State-of-the-art diffusion models can generate highly realistic images based\non various conditioning like text, segmentation, and depth. However, an\nessential aspect often overlooked is the specific camera geometry used during\nimage capture. The influence of different optical systems on the final scene\nappearance is frequently overlooked. This study introduces a framework that\nintimately integrates a text-to-image diffusion model with the particular lens\ngeometry used in image rendering. Our method is based on a per-pixel coordinate\nconditioning method, enabling the control over the rendering geometry. 
Notably,\nwe demonstrate the manipulation of curvature properties, achieving diverse\nvisual effects, such as fish-eye, panoramic views, and spherical texturing\nusing a single diffusion model.\n","authors":["Andrey Voynov","Amir Hertz","Moab Arar","Shlomi Fruchter","Daniel Cohen-Or"],"pdf_url":"https://arxiv.org/pdf/2311.17609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17608v1","updated":"2023-11-29T13:05:20Z","published":"2023-11-29T13:05:20Z","title":"Adversarial Robust Memory-Based Continual Learner","summary":" Despite the remarkable advances that have been made in continual learning,\nthe adversarial vulnerability of such methods has not been fully discussed. We\ndelve into the adversarial robustness of memory-based continual learning\nalgorithms and observe limited robustness improvement by directly applying\nadversarial training techniques. Preliminary studies reveal the twin challenges\nfor building adversarial robust continual learners: accelerated forgetting in\ncontinual learning and gradient obfuscation in adversarial robustness. In this\nstudy, we put forward a novel adversarial robust memory-based continual learner\nthat adjusts data logits to mitigate the forgetting of pasts caused by\nadversarial samples. Furthermore, we devise a gradient-based data selection\nmechanism to overcome the gradient obfuscation caused by limited stored data.\nThe proposed approach can widely integrate with existing memory-based continual\nlearning as well as adversarial training algorithms in a plug-and-play way.\nExtensive experiments on Split-CIFAR10/100 and Split-Tiny-ImageNet demonstrate\nthe effectiveness of our approach, achieving up to 8.13% higher accuracy for\nadversarial data.\n","authors":["Xiaoyue Mi","Fan Tang","Zonghan Yang","Danding Wang","Juan Cao","Peng Li","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.17608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17607v1","updated":"2023-11-29T13:05:06Z","published":"2023-11-29T13:05:06Z","title":"Topology-Preserving Adversarial Training","summary":" Despite the effectiveness in improving the robustness of neural networks,\nadversarial training has suffered from the natural accuracy degradation\nproblem, i.e., accuracy on natural samples has reduced significantly. In this\nstudy, we reveal that natural accuracy degradation is highly related to the\ndisruption of the natural sample topology in the representation space by\nquantitative and qualitative experiments. Based on this observation, we propose\nTopology-pReserving Adversarial traINing (TRAIN) to alleviate the problem by\npreserving the topology structure of natural samples from a standard model\ntrained only on natural samples during adversarial training. As an additional\nregularization, our method can easily be combined with various popular\nadversarial training algorithms in a plug-and-play manner, taking advantage of\nboth sides. Extensive experiments on CIFAR-10, CIFAR-100, and Tiny ImageNet\nshow that our proposed method achieves consistent and significant improvements\nover various strong baselines in most cases. 
Specifically, without additional\ndata, our proposed method achieves up to 8.78% improvement in natural accuracy\nand 4.50% improvement in robust accuracy.\n","authors":["Xiaoyue Mi","Fan Tang","Yepeng Weng","Danding Wang","Juan Cao","Sheng Tang","Peng Li","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.17607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05055v2","updated":"2023-11-29T12:59:20Z","published":"2023-10-08T07:41:15Z","title":"FairTune: Optimizing Parameter Efficient Fine Tuning for Fairness in\n Medical Image Analysis","summary":" Training models with robust group fairness properties is crucial in ethically\nsensitive application areas such as medical diagnosis. Despite the growing body\nof work aiming to minimise demographic bias in AI, this problem remains\nchallenging. A key reason for this challenge is the fairness generalisation\ngap: High-capacity deep learning models can fit all training data nearly\nperfectly, and thus also exhibit perfect fairness during training. In this\ncase, bias emerges only during testing when generalisation performance differs\nacross subgroups. This motivates us to take a bi-level optimisation perspective\non fair learning: Optimising the learning strategy based on validation\nfairness. Specifically, we consider the highly effective workflow of adapting\npre-trained models to downstream medical imaging tasks using\nparameter-efficient fine-tuning (PEFT) techniques. There is a trade-off between\nupdating more parameters, enabling a better fit to the task of interest vs.\nfewer parameters, potentially reducing the generalisation gap. To manage this\ntradeoff, we propose FairTune, a framework to optimise the choice of PEFT\nparameters with respect to fairness. We demonstrate empirically that FairTune\nleads to improved fairness on a range of medical imaging datasets.\n","authors":["Raman Dutt","Ondrej Bohdal","Sotirios A. Tsaftaris","Timothy Hospedales"],"pdf_url":"https://arxiv.org/pdf/2310.05055v2.pdf","comment":"9 pages, 2 tables, 4 figures"},{"id":"http://arxiv.org/abs/2311.17600v1","updated":"2023-11-29T12:49:45Z","published":"2023-11-29T12:49:45Z","title":"Query-Relevant Images Jailbreak Large Multi-Modal Models","summary":" Warning: This paper contains examples of harmful language and images, and\nreader discretion is recommended. The security concerns surrounding Large\nLanguage Models (LLMs) have been extensively explored, yet the safety of Large\nMulti-Modal Models (LMMs) remains understudied. In our study, we present a\nnovel visual prompt attack that exploits query-relevant images to jailbreak the\nopen-source LMMs. Our method creates a composite image from one image generated\nby diffusion models and another that displays the text as typography, based on\nkeywords extracted from a malicious query. We show LLMs can be easily attacked\nby our approach, even if the employed Large Language Models are safely aligned.\nTo evaluate the extent of this vulnerability in open-source LMMs, we have\ncompiled a substantial dataset encompassing 13 scenarios with a total of 5,040\ntext-image pairs, using our presented attack technique. Our evaluation of 12\ncutting-edge LMMs using this dataset shows the vulnerability of existing\nmulti-modal models on adversarial attacks. This finding underscores the need\nfor a concerted effort to strengthen and enhance the safety measures of\nopen-source LMMs against potential malicious exploits. 
The resource is\navailable at \\href{this https URL}{https://github.com/isXinLiu/MM-SafetyBench}.\n","authors":["Xin Liu","Yichen Zhu","Yunshi Lan","Chao Yang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2311.17600v1.pdf","comment":"Technique report"},{"id":"http://arxiv.org/abs/2306.16269v2","updated":"2023-11-29T12:47:59Z","published":"2023-06-28T14:51:34Z","title":"RSPrompter: Learning to Prompt for Remote Sensing Instance Segmentation\n based on Visual Foundation Model","summary":" Leveraging the extensive training data from SA-1B, the Segment Anything Model\n(SAM) demonstrates remarkable generalization and zero-shot capabilities.\nHowever, as a category-agnostic instance segmentation method, SAM heavily\nrelies on prior manual guidance, including points, boxes, and coarse-grained\nmasks. Furthermore, its performance in remote sensing image segmentation tasks\nremains largely unexplored and unproven. In this paper, we aim to develop an\nautomated instance segmentation approach for remote sensing images, based on\nthe foundational SAM model and incorporating semantic category information.\nDrawing inspiration from prompt learning, we propose a method to learn the\ngeneration of appropriate prompts for SAM. This enables SAM to produce\nsemantically discernible segmentation results for remote sensing images, a\nconcept we have termed RSPrompter. We also propose several ongoing derivatives\nfor instance segmentation tasks, drawing on recent advancements within the SAM\ncommunity, and compare their performance with RSPrompter. Extensive\nexperimental results, derived from the WHU building, NWPU VHR-10, and SSDD\ndatasets, validate the effectiveness of our proposed method. The code for our\nmethod is publicly available at kychen.me/RSPrompter.\n","authors":["Keyan Chen","Chenyang Liu","Hao Chen","Haotian Zhang","Wenyuan Li","Zhengxia Zou","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2306.16269v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17597v1","updated":"2023-11-29T12:47:42Z","published":"2023-11-29T12:47:42Z","title":"Continual Self-supervised Learning: Towards Universal Multi-modal\n Medical Data Representation Learning","summary":" Self-supervised learning is an efficient pre-training method for medical\nimage analysis. However, current research is mostly confined to\nspecific-modality data pre-training, consuming considerable time and resources\nwithout achieving universality across different modalities. A straightforward\nsolution is combining all modality data for joint self-supervised pre-training,\nwhich poses practical challenges. Firstly, our experiments reveal conflicts in\nrepresentation learning as the number of modalities increases. Secondly,\nmulti-modal data collected in advance cannot cover all real-world scenarios. In\nthis paper, we reconsider versatile self-supervised learning from the\nperspective of continual learning and propose MedCoSS, a continuous\nself-supervised learning approach for multi-modal medical data. Unlike joint\nself-supervised learning, MedCoSS assigns different modality data to different\ntraining stages, forming a multi-stage pre-training process. To balance modal\nconflicts and prevent catastrophic forgetting, we propose a rehearsal-based\ncontinual learning method. 
We introduce the k-means sampling strategy to retain\ndata from previous modalities and rehearse it when learning new modalities.\nInstead of executing the pretext task on buffer data, a feature distillation\nstrategy and an intra-modal mixup strategy are applied to these data for\nknowledge retention. We conduct continuous self-supervised pre-training on a\nlarge-scale multi-modal unlabeled dataset, including clinical reports, X-rays,\nCT scans, MRI scans, and pathological images. Experimental results demonstrate\nMedCoSS's exceptional generalization ability across nine downstream datasets\nand its significant scalability in integrating new modality data. Code and\npre-trained weight are available at https://github.com/yeerwen/MedCoSS.\n","authors":["Yiwen Ye","Yutong Xie","Jianpeng Zhang","Ziyang Chen","Qi Wu","Yong Xia"],"pdf_url":"https://arxiv.org/pdf/2311.17597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17593v1","updated":"2023-11-29T12:41:55Z","published":"2023-11-29T12:41:55Z","title":"LanGWM: Language Grounded World Model","summary":" Recent advances in deep reinforcement learning have showcased its potential\nin tackling complex tasks. However, experiments on visual control tasks have\nrevealed that state-of-the-art reinforcement learning models struggle with\nout-of-distribution generalization. Conversely, expressing higher-level\nconcepts and global contexts is relatively easy using language.\n Building upon recent success of the large language models, our main objective\nis to improve the state abstraction technique in reinforcement learning by\nleveraging language for robust action selection. Specifically, we focus on\nlearning language-grounded visual features to enhance the world model learning,\na model-based reinforcement learning technique.\n To enforce our hypothesis explicitly, we mask out the bounding boxes of a few\nobjects in the image observation and provide the text prompt as descriptions\nfor these masked objects. Subsequently, we predict the masked objects along\nwith the surrounding regions as pixel reconstruction, similar to the\ntransformer-based masked autoencoder approach.\n Our proposed LanGWM: Language Grounded World Model achieves state-of-the-art\nperformance in out-of-distribution test at the 100K interaction steps\nbenchmarks of iGibson point navigation tasks. Furthermore, our proposed\ntechnique of explicit language-grounded visual representation learning has the\npotential to improve models for human-robot interaction because our extracted\nvisual features are language grounded.\n","authors":["Rudra P. K. Poudel","Harit Pandya","Chao Zhang","Roberto Cipolla"],"pdf_url":"https://arxiv.org/pdf/2311.17593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17590v1","updated":"2023-11-29T12:35:34Z","published":"2023-11-29T12:35:34Z","title":"SyncTalk: The Devil is in the Synchronization for Talking Head Synthesis","summary":" Achieving high synchronization in the synthesis of realistic, speech-driven\ntalking head videos presents a significant challenge. Traditional Generative\nAdversarial Networks (GAN) struggle to maintain consistent facial identity,\nwhile Neural Radiance Fields (NeRF) methods, although they can address this\nissue, often produce mismatched lip movements, inadequate facial expressions,\nand unstable head poses. A lifelike talking head requires synchronized\ncoordination of subject identity, lip movements, facial expressions, and head\nposes. 
The absence of these synchronizations is a fundamental flaw, leading to\nunrealistic and artificial outcomes. To address the critical issue of\nsynchronization, identified as the \"devil\" in creating realistic talking heads,\nwe introduce SyncTalk. This NeRF-based method effectively maintains subject\nidentity, enhancing synchronization and realism in talking head synthesis.\nSyncTalk employs a Face-Sync Controller to align lip movements with speech and\ninnovatively uses a 3D facial blendshape model to capture accurate facial\nexpressions. Our Head-Sync Stabilizer optimizes head poses, achieving more\nnatural head movements. The Portrait-Sync Generator restores hair details and\nblends the generated head with the torso for a seamless visual experience.\nExtensive experiments and user studies demonstrate that SyncTalk outperforms\nstate-of-the-art methods in synchronization and realism. We recommend watching\nthe supplementary video: https://ziqiaopeng.github.io/synctalk\n","authors":["Ziqiao Peng","Wentao Hu","Yue Shi","Xiangyu Zhu","Xiaomei Zhang","Hao Zhao","Jun He","Hongyan Liu","Zhaoxin Fan"],"pdf_url":"https://arxiv.org/pdf/2311.17590v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2302.02124v2","updated":"2023-11-29T12:31:42Z","published":"2023-02-04T07:50:31Z","title":"Transform, Contrast and Tell: Coherent Entity-Aware Multi-Image\n Captioning","summary":" Coherent entity-aware multi-image captioning aims to generate coherent\ncaptions for neighboring images in a news document. There are coherence\nrelationships among neighboring images because they often describe the same\nentities or events. These relationships are important for entity-aware\nmulti-image captioning, but are neglected in entity-aware single-image\ncaptioning. Most existing work focuses on single-image captioning, while\nmulti-image captioning has not been explored before. Hence, this paper proposes\na coherent entity-aware multi-image captioning model by making use of coherence\nrelationships. The model consists of a Transformer-based caption generation\nmodel and two types of contrastive learning-based coherence mechanisms. The\ngeneration model generates the caption by paying attention to the image and the\naccompanying text. The caption-caption coherence mechanism aims to ensure that\nentities in the caption of the image also appear in captions of neighboring images.\nThe caption-image-text coherence mechanism aims to ensure that entities in the\ncaption of the image also appear in the accompanying text. To evaluate coherence\nbetween captions, two coherence evaluation metrics are proposed. A new\ndataset, DM800K, is constructed that has more images per document than the two\nexisting datasets GoodNews and NYT800K, and is more suitable for multi-image\ncaptioning. Experiments on three datasets show that the proposed captioning model\noutperforms 7 baselines according to BLEU, Rouge, METEOR, and entity precision\nand recall scores. 
Experiments also show that the generated captions are more\ncoherent than that of baselines according to caption entity scores, caption\nRouge scores, the two proposed coherence evaluation metrics, and human\nevaluations.\n","authors":["Jingqiang Chen"],"pdf_url":"https://arxiv.org/pdf/2302.02124v2.pdf","comment":"32 pages, 11 tables, 3 figures"},{"id":"http://arxiv.org/abs/2311.17583v1","updated":"2023-11-29T12:21:42Z","published":"2023-11-29T12:21:42Z","title":"CLIPC8: Face liveness detection algorithm based on image-text pairs and\n contrastive learning","summary":" Face recognition technology is widely used in the financial field, and\nvarious types of liveness attack behaviors need to be addressed. Existing\nliveness detection algorithms are trained on specific training datasets and\ntested on testing datasets, but their performance and robustness in\ntransferring to unseen datasets are relatively poor. To tackle this issue, we\npropose a face liveness detection method based on image-text pairs and\ncontrastive learning, dividing liveness attack problems in the financial field\ninto eight categories and using text information to describe the images of\nthese eight types of attacks. The text encoder and image encoder are used to\nextract feature vector representations for the classification description text\nand face images, respectively. By maximizing the similarity of positive samples\nand minimizing the similarity of negative samples, the model learns shared\nrepresentations between images and texts. The proposed method is capable of\neffectively detecting specific liveness attack behaviors in certain scenarios,\nsuch as those occurring in dark environments or involving the tampering of ID\ncard photos. Additionally, it is also effective in detecting traditional\nliveness attack methods, such as printing photo attacks and screen remake\nattacks. The zero-shot capabilities of face liveness detection on five public\ndatasets, including NUAA, CASIA-FASD, Replay-Attack, OULU-NPU and MSU-MFSD also\nreaches the level of commercial algorithms. The detection capability of\nproposed algorithm was verified on 5 types of testing datasets, and the results\nshow that the method outperformed commercial algorithms, and the detection\nrates reached 100% on multiple datasets. Demonstrating the effectiveness and\nrobustness of introducing image-text pairs and contrastive learning into\nliveness detection tasks as proposed in this paper.\n","authors":["Xu Liu","Shu Zhou","Yurong Song","Wenzhe Luo","Xin Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.17583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17571v1","updated":"2023-11-29T12:06:19Z","published":"2023-11-29T12:06:19Z","title":"LGFCTR: Local and Global Feature Convolutional Transformer for Image\n Matching","summary":" Image matching that finding robust and accurate correspondences across images\nis a challenging task under extreme conditions. Capturing local and global\nfeatures simultaneously is an important way to mitigate such an issue but\nrecent transformer-based decoders were still stuck in the issues that CNN-based\nencoders only extract local features and the transformers lack locality.\nInspired by the locality and implicit positional encoding of convolutions, a\nnovel convolutional transformer is proposed to capture both local contexts and\nglobal structures more sufficiently for detector-free matching. 
Firstly, a\nuniversal FPN-like framework captures global structures in self-encoder as well\nas cross-decoder by transformers and compensates local contexts as well as\nimplicit positional encoding by convolutions. Secondly, a novel convolutional\ntransformer module explores multi-scale long range dependencies by a novel\nmulti-scale attention and further aggregates local information inside\ndependencies for enhancing locality. Finally, a novel regression-based\nsub-pixel refinement module exploits the whole fine-grained window features for\nfine-level positional deviation regression. The proposed method achieves\nsuperior performance on a wide range of benchmarks. The code will be available\nat https://github.com/zwh0527/LGFCTR.\n","authors":["Wenhao Zhong","Jie Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.17571v1.pdf","comment":"8 pages of main text, 7 pages of supplementary material, 3 pages of\n references, 6 figures in main text and 8 figures in supplementary material, 5\n tables in main text and 2 tables in supplementary material"},{"id":"http://arxiv.org/abs/2311.14284v2","updated":"2023-11-29T12:01:35Z","published":"2023-11-24T05:17:01Z","title":"Paragraph-to-Image Generation with Information-Enriched Diffusion Model","summary":" Text-to-image (T2I) models have recently experienced rapid development,\nachieving astonishing performance in terms of fidelity and textual alignment\ncapabilities. However, given a long paragraph (up to 512 words), these\ngeneration models still struggle to achieve strong alignment and are unable to\ngenerate images depicting complex scenes. In this paper, we introduce an\ninformation-enriched diffusion model for the paragraph-to-image generation task,\ntermed ParaDiffusion, which delves into the transference of the extensive\nsemantic comprehension capabilities of large language models to the task of\nimage generation. At its core is using a large language model (e.g., Llama V2)\nto encode long-form text, followed by fine-tuning with LORA to align the\ntext-image feature spaces in the generation task. To facilitate the training of\nlong-text semantic alignment, we also curated a high-quality paragraph-image\npair dataset, namely ParaImage. This dataset contains a small amount of\nhigh-quality, meticulously annotated data, and a large-scale synthetic dataset\nwith long text descriptions being generated using a vision-language model.\nExperiments demonstrate that ParaDiffusion outperforms state-of-the-art models\n(SD XL, DeepFloyd IF) on ViLG-300 and ParaPrompts, achieving up to 15% and 45%\nhuman voting rate improvements for visual appeal and text faithfulness,\nrespectively. The code and dataset will be released to foster community\nresearch on long-text alignment.\n","authors":["Weijia Wu","Zhuang Li","Yefei He","Mike Zheng Shou","Chunhua Shen","Lele Cheng","Yan Li","Tingting Gao","Di Zhang","Zhongyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.14284v2.pdf","comment":"The project website is at:\n https://weijiawu.github.io/ParaDiffusionPage/. Code:\n https://github.com/weijiawu/ParaDiffusion"},{"id":"http://arxiv.org/abs/2306.16741v3","updated":"2023-11-29T11:54:44Z","published":"2023-06-29T07:34:25Z","title":"Foundation Model for Endoscopy Video Analysis via Large-scale\n Self-supervised Pre-train","summary":" Foundation models have exhibited remarkable success in various applications,\nsuch as disease diagnosis and text report generation. To date, a foundation\nmodel for endoscopic video analysis is still lacking. 
In this paper, we propose\nEndo-FM, a foundation model specifically developed using massive endoscopic\nvideo data. First, we build a video transformer, which captures both local and\nglobal long-range dependencies across spatial and temporal dimensions. Second,\nwe pre-train our transformer model using global and local views via a\nself-supervised manner, aiming to make it robust to spatial-temporal variations\nand discriminative across different scenes. To develop the foundation model, we\nconstruct a large-scale endoscopy video dataset by combining 9 publicly\navailable datasets and a privately collected dataset from Baoshan Branch of\nRenji Hospital in Shanghai, China. Our dataset overall consists of over 33K\nvideo clips with up to 5 million frames, encompassing various protocols, target\norgans, and disease types. Our pre-trained Endo-FM can be easily adopted for a\ngiven downstream task via fine-tuning by serving as the backbone. With\nexperiments on 3 different types of downstream tasks, including classification,\nsegmentation, and detection, our Endo-FM surpasses the current state-of-the-art\n(SOTA) self-supervised pre-training and adapter-based transfer learning methods\nby a significant margin, such as VCL (3.1% F1, 4.8% Dice, and 5.5% F1 for\nclassification, segmentation, and detection) and ST-Adapter (5.9% F1, 9.6%\nDice, and 9.9% F1 for classification, segmentation, and detection). Code,\ndatasets, and models are released at https://github.com/med-air/Endo-FM.\n","authors":["Zhao Wang","Chang Liu","Shaoting Zhang","Qi Dou"],"pdf_url":"https://arxiv.org/pdf/2306.16741v3.pdf","comment":"MICCAI 2023 camera-ready version"},{"id":"http://arxiv.org/abs/2311.17552v1","updated":"2023-11-29T11:35:54Z","published":"2023-11-29T11:35:54Z","title":"An Efficient Illumination Invariant Tiger Detection Framework for\n Wildlife Surveillance","summary":" Tiger conservation necessitates the strategic deployment of multifaceted\ninitiatives encompassing the preservation of ecological habitats, anti-poaching\nmeasures, and community involvement for sustainable growth in the tiger\npopulation. With the advent of artificial intelligence, tiger surveillance can\nbe automated using object detection. In this paper, an accurate illumination\ninvariant framework is proposed based on EnlightenGAN and YOLOv8 for tiger\ndetection. The fine-tuned YOLOv8 model achieves a mAP score of 61% without\nillumination enhancement. The illumination enhancement improves the mAP by\n0.7%. The approaches elevate the state-of-the-art performance on the ATRW\ndataset by approximately 6% to 7%.\n","authors":["Gaurav Pendharkar","A. Ancy Micheal","Jason Misquitta","Ranjeesh Kaippada"],"pdf_url":"https://arxiv.org/pdf/2311.17552v1.pdf","comment":"accepted at ICCIS 2023"},{"id":"http://arxiv.org/abs/2311.16668v2","updated":"2023-11-29T11:29:45Z","published":"2023-11-28T10:29:39Z","title":"LiveNVS: Neural View Synthesis on Live RGB-D Streams","summary":" Existing real-time RGB-D reconstruction approaches, like Kinect Fusion, lack\nreal-time photo-realistic visualization. This is due to noisy, oversmoothed or\nincomplete geometry and blurry textures which are fused from imperfect depth\nmaps and camera poses. 
Recent neural rendering methods can overcome many of\nsuch artifacts but are mostly optimized for offline usage, hindering the\nintegration into a live reconstruction pipeline.\n In this paper, we present LiveNVS, a system that allows for neural novel view\nsynthesis on a live RGB-D input stream with very low latency and real-time\nrendering. Based on the RGB-D input stream, novel views are rendered by\nprojecting neural features into the target view via a densely fused depth map\nand aggregating the features in image-space to a target feature map. A\ngeneralizable neural network then translates the target feature map into a\nhigh-quality RGB image. LiveNVS achieves state-of-the-art neural rendering\nquality of unknown scenes during capturing, allowing users to virtually explore\nthe scene and assess reconstruction quality in real-time.\n","authors":["Laura Fink","Darius Rückert","Linus Franke","Joachim Keinert","Marc Stamminger"],"pdf_url":"https://arxiv.org/pdf/2311.16668v2.pdf","comment":"main paper: 8 pages, total number of pages: 15, 13 figures, to be\n published in SIGGRAPH Asia 2023 Conference Papers; edits: link was fixed"},{"id":"http://arxiv.org/abs/2311.17546v1","updated":"2023-11-29T11:28:26Z","published":"2023-11-29T11:28:26Z","title":"VINNA for Neonates -- Orientation Independence through Latent\n Augmentations","summary":" Fast and accurate segmentation of neonatal brain images is highly desired to\nbetter understand and detect changes during development and disease. Yet, the\nlimited availability of ground truth datasets, lack of standardized acquisition\nprotocols, and wide variations of head positioning pose challenges for method\ndevelopment. A few automated image analysis pipelines exist for newborn brain\nMRI segmentation, but they often rely on time-consuming procedures and require\nresampling to a common resolution, subject to loss of information due to\ninterpolation and down-sampling. Without registration and image resampling,\nvariations with respect to head positions and voxel resolutions have to be\naddressed differently. In deep-learning, external augmentations are\ntraditionally used to artificially expand the representation of spatial\nvariability, increasing the training dataset size and robustness. However,\nthese transformations in the image space still require resampling, reducing\naccuracy specifically in the context of label interpolation. We recently\nintroduced the concept of resolution-independence with the Voxel-size\nIndependent Neural Network framework, VINN. Here, we extend this concept by\nadditionally shifting all rigid-transforms into the network architecture with a\nfour degree of freedom (4-DOF) transform module, enabling resolution-aware\ninternal augmentations (VINNA). In this work we show that VINNA (i)\nsignificantly outperforms state-of-the-art external augmentation approaches,\n(ii) effectively addresses the head variations present specifically in newborn\ndatasets, and (iii) retains high segmentation accuracy across a range of\nresolutions (0.5-1.0 mm). The 4-DOF transform module is a powerful, general\napproach to implement spatial augmentation without requiring image or label\ninterpolation. 
The specific network application to newborns will be made\npublicly available as VINNA4neonates.\n","authors":["Leonie Henschel","David Kügler","Lilla Zöllei","Martin Reuter"],"pdf_url":"https://arxiv.org/pdf/2311.17546v1.pdf","comment":"Under Review at Imaging Neuroscience"},{"id":"http://arxiv.org/abs/2311.17536v1","updated":"2023-11-29T11:14:43Z","published":"2023-11-29T11:14:43Z","title":"Smooth Video Synthesis with Noise Constraints on Diffusion Models for\n One-shot Video Tuning","summary":" Recent one-shot video tuning methods, which fine-tune the network on a\nspecific video based on pre-trained text-to-image models (e.g., Stable\nDiffusion), are popular in the community because of the flexibility. However,\nthese methods often produce videos marred by incoherence and inconsistency. To\naddress these limitations, this paper introduces a simple yet effective noise\nconstraint across video frames. This constraint aims to regulate noise\npredictions across their temporal neighbors, resulting in smooth latents. It\ncan be simply included as a loss term during the training phase. By applying\nthe loss to existing one-shot video tuning methods, we significantly improve\nthe overall consistency and smoothness of the generated videos. Furthermore, we\nargue that current video evaluation metrics inadequately capture smoothness. To\naddress this, we introduce a novel metric that considers detailed features and\ntheir temporal dynamics. Experimental results validate the effectiveness of our\napproach in producing smoother videos on various one-shot video tuning\nbaselines. The source codes and video demos are available at\n\\href{https://github.com/SPengLiang/SmoothVideo}{https://github.com/SPengLiang/SmoothVideo}.\n","authors":["Liang Peng","Haoran Cheng","Zheng Yang","Ruisi Zhao","Linxuan Xia","Chaotian Song","Qinglin Lu","Wei Liu","Boxi Wu"],"pdf_url":"https://arxiv.org/pdf/2311.17536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17532v1","updated":"2023-11-29T11:10:40Z","published":"2023-11-29T11:10:40Z","title":"Weakly-Supervised Emotion Transition Learning for Diverse 3D Co-speech\n Gesture Generation","summary":" Generating vivid and emotional 3D co-speech gestures is crucial for virtual\navatar animation in human-machine interaction applications. While the existing\nmethods enable generating the gestures to follow a single emotion label, they\noverlook that long gesture sequence modeling with emotion transition is more\npractical in real scenes. In addition, the lack of large-scale available\ndatasets with emotional transition speech and corresponding 3D human gestures\nalso limits the addressing of this task. To fulfill this goal, we first\nincorporate the ChatGPT-4 and an audio inpainting approach to construct the\nhigh-fidelity emotion transition human speeches. Considering obtaining the\nrealistic 3D pose annotations corresponding to the dynamically inpainted\nemotion transition audio is extremely difficult, we propose a novel weakly\nsupervised training strategy to encourage authority gesture transitions.\nSpecifically, to enhance the coordination of transition gestures w.r.t\ndifferent emotional ones, we model the temporal association representation\nbetween two different emotional gesture sequences as style guidance and infuse\nit into the transition generation. We further devise an emotion mixture\nmechanism that provides weak supervision based on a learnable mixed emotion\nlabel for transition gestures. 
Last, we present a keyframe sampler to supply\neffective initial posture cues in long sequences, enabling us to generate\ndiverse gestures. Extensive experiments demonstrate that our method outperforms\nthe state-of-the-art models constructed by adapting single emotion-conditioned\ncounterparts on our newly defined emotion transition task and datasets.\n","authors":["Xingqun Qi","Jiahao Pan","Peng Li","Ruibin Yuan","Xiaowei Chi","Mengfei Li","Wenhan Luo","Wei Xue","Shanghang Zhang","Qifeng Liu","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2311.17532v1.pdf","comment":"The code and dataset will be released as soon as possible"},{"id":"http://arxiv.org/abs/2311.17528v1","updated":"2023-11-29T11:01:38Z","published":"2023-11-29T11:01:38Z","title":"HiDiffusion: Unlocking High-Resolution Creativity and Efficiency in\n Low-Resolution Trained Diffusion Models","summary":" We introduce HiDiffusion, a tuning-free framework comprised of\nResolution-Aware U-Net (RAU-Net) and Modified Shifted Window Multi-head\nSelf-Attention (MSW-MSA) to enable pretrained large text-to-image diffusion\nmodels to efficiently generate high-resolution images (e.g. 1024$\\times$1024)\nthat surpass the training image resolution. Pretrained diffusion models\nencounter unreasonable object duplication in generating images beyond the\ntraining image resolution. We attribute it to the mismatch between the feature\nmap size of high-resolution images and the receptive field of U-Net's\nconvolution. To address this issue, we propose a simple yet scalable method\nnamed RAU-Net. RAU-Net dynamically adjusts the feature map size to match the\nconvolution's receptive field in the deep block of U-Net. Another obstacle in\nhigh-resolution synthesis is the slow inference speed of U-Net. Our\nobservations reveal that the global self-attention in the top block, which\nexhibits locality, however, consumes the majority of computational resources.\nTo tackle this issue, we propose MSW-MSA. Unlike previous window attention\nmechanisms, our method uses a much larger window size and dynamically shifts\nwindows to better accommodate diffusion models. Extensive experiments\ndemonstrate that our HiDiffusion can scale diffusion models to generate\n1024$\\times$1024, 2048$\\times$2048, or even 4096$\\times$4096 resolution images,\nwhile simultaneously reducing inference time by 40\\%-60\\%, achieving\nstate-of-the-art performance on high-resolution image synthesis. The most\nsignificant revelation of our work is that a pretrained diffusion model on\nlow-resolution images is scalable for high-resolution generation without\nfurther tuning. We hope this revelation can provide insights for future\nresearch on the scalability of diffusion models.\n","authors":["Shen Zhang","Zhaowei Chen","Zhenyu Zhao","Zhenyuan Chen","Yao Tang","Yuhao Chen","Wengang Cao","Jiajun Liang"],"pdf_url":"https://arxiv.org/pdf/2311.17528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17525v1","updated":"2023-11-29T10:53:08Z","published":"2023-11-29T10:53:08Z","title":"A publicly available vessel segmentation algorithm for SLO images","summary":" Background and Objective: Infra-red scanning laser ophthalmoscope (IRSLO)\nimages are akin to colour fundus photographs in displaying the posterior pole\nand retinal vasculature fine detail. While there are many trained networks\nreadily available for retinal vessel segmentation in colour fundus photographs,\nnone cater to IRSLO images. 
Accordingly, we aimed to develop (and release as\nopen source) a vessel segmentation algorithm tailored specifically to IRSLO\nimages. Materials and Methods: We used 23 expertly annotated IRSLO images from\nthe RAVIR dataset, combined with 7 additional images annotated in-house. We\ntrained a U-Net (convolutional neural network) to label pixels as 'vessel' or\n'background'. Results: On an unseen test set (4 images), our model achieved an\nAUC of 0.981, and an AUPRC of 0.815. Upon thresholding, it achieved a\nsensitivity of 0.844, a specificity of 0.983, and an F1 score of 0.857.\nConclusion: We have made our automatic segmentation algorithm publicly\navailable and easy to use. Researchers can use the generated vessel maps to\ncompute metrics such as fractal dimension and vessel density.\n","authors":["Adam Threlfall","Samuel Gibbon","James Cameron","Tom MacGillivray"],"pdf_url":"https://arxiv.org/pdf/2311.17525v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2311.17524v1","updated":"2023-11-29T10:53:05Z","published":"2023-11-29T10:53:05Z","title":"Improving Stability during Upsampling -- on the Importance of Spatial\n Context","summary":" State-of-the-art models for pixel-wise prediction tasks such as image\nrestoration, image segmentation, or disparity estimation, involve several\nstages of data resampling, in which the resolution of feature maps is first\nreduced to aggregate information and then sequentially increased to generate a\nhigh-resolution output. Several previous works have investigated the effect of\nartifacts that are invoked during downsampling and diverse cures have been\nproposed that facilitate to improve prediction stability and even robustness\nfor image classification. However, equally relevant, artifacts that arise\nduring upsampling have been less discussed. This is significantly relevant as\nupsampling and downsampling approaches face fundamentally different challenges.\nWhile during downsampling, aliases and artifacts can be reduced by blurring\nfeature maps, the emergence of fine details is crucial during upsampling.\nBlurring is therefore not an option and dedicated operations need to be\nconsidered. In this work, we are the first to explore the relevance of context\nduring upsampling by employing convolutional upsampling operations with\nincreasing kernel size while keeping the encoder unchanged. We find that\nincreased kernel sizes can in general improve the prediction stability in tasks\nsuch as image restoration or image segmentation, while a block that allows for\na combination of small-size kernels for fine details and large-size kernels for\nartifact removal and increased context yields the best results.\n","authors":["Shashank Agnihotri","Julia Grabinski","Margret Keuper"],"pdf_url":"https://arxiv.org/pdf/2311.17524v1.pdf","comment":"Stable upsampling, reduction in spectral artifacts"},{"id":"http://arxiv.org/abs/2305.18381v3","updated":"2023-11-29T10:46:19Z","published":"2023-05-28T06:53:41Z","title":"Distill Gold from Massive Ores: Efficient Dataset Distillation via\n Critical Samples Selection","summary":" Data-efficient learning has garnered significant attention, especially given\nthe current trend of large multi-modal models. Recently, dataset distillation\nbecomes an effective approach for data-efficiency; however, the distillation\nprocess itself can still be inefficient. In this work, we model the dataset\ndistillation task within the context of information transport. 
By observing the\nsubstantial data redundancy inherent in the distillation, we argue to put more\nemphasis on the samples' utility for the distillation task. We introduce and\nvalidate a family of data utility estimators and optimal data selection methods\nto exploit the most valuable samples. This strategy significantly reduces the\ntraining costs and extends various existing distillation algorithms to larger\nand more diversified datasets, e.g., in some cases only 0.04% training data is\nsufficient for comparable distillation performance. Our method consistently\nenhances the distillation algorithms, even on much larger-scale and more\nheterogeneous datasets, e.g. ImageNet-1K and Kinetics-400. This paradigm opens\nup new avenues in the dynamics of distillation and paves the way for efficient\ndataset distillation. Our code is available on\nhttps://github.com/silicx/GoldFromOres .\n","authors":["Yue Xu","Yong-Lu Li","Kaitong Cui","Ziyu Wang","Cewu Lu","Yu-Wing Tai","Chi-Keung Tang"],"pdf_url":"https://arxiv.org/pdf/2305.18381v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17518v1","updated":"2023-11-29T10:40:52Z","published":"2023-11-29T10:40:52Z","title":"The devil is in the fine-grained details: Evaluating open-vocabulary\n object detectors for fine-grained understanding","summary":" Recent advancements in large vision-language models enabled visual object\ndetection in open-vocabulary scenarios, where object classes are defined in\nfree-text formats during inference. In this paper, we aim to probe the\nstate-of-the-art methods for open-vocabulary object detection to determine to\nwhat extent they understand fine-grained properties of objects and their parts.\nTo this end, we introduce an evaluation protocol based on dynamic vocabulary\ngeneration to test whether models detect, discern, and assign the correct\nfine-grained description to objects in the presence of hard-negative classes.\nWe contribute with a benchmark suite of increasing difficulty and probing\ndifferent properties like color, pattern, and material. We further enhance our\ninvestigation by evaluating several state-of-the-art open-vocabulary object\ndetectors using the proposed protocol and find that most existing solutions,\nwhich shine in standard open-vocabulary benchmarks, struggle to accurately\ncapture and distinguish finer object details. We conclude the paper by\nhighlighting the limitations of current methodologies and exploring promising\nresearch directions to overcome the discovered drawbacks. Data and code are\navailable at https://github.com/lorebianchi98/FG-OVD.\n","authors":["Lorenzo Bianchi","Fabio Carrara","Nicola Messina","Claudio Gennaro","Fabrizio Falchi"],"pdf_url":"https://arxiv.org/pdf/2311.17518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17516v1","updated":"2023-11-29T10:39:53Z","published":"2023-11-29T10:39:53Z","title":"MMA-Diffusion: MultiModal Attack on Diffusion Models","summary":" In recent years, Text-to-Image (T2I) models have seen remarkable\nadvancements, gaining widespread adoption. However, this progress has\ninadvertently opened avenues for potential misuse, particularly in generating\ninappropriate or Not-Safe-For-Work (NSFW) content. Our work introduces\nMMA-Diffusion, a framework that presents a significant and realistic threat to\nthe security of T2I models by effectively circumventing current defensive\nmeasures in both open-source models and commercial online services. 
Unlike\nprevious approaches, MMA-Diffusion leverages both textual and visual modalities\nto bypass safeguards like prompt filters and post-hoc safety checkers, thus\nexposing and highlighting the vulnerabilities in existing defense mechanisms.\n","authors":["Yijun Yang","Ruiyuan Gao","Xiaosen Wang","Nan Xu","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2311.17516v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17515v1","updated":"2023-11-29T10:38:42Z","published":"2023-11-29T10:38:42Z","title":"Fusion of Single and Integral Multispectral Aerial Images","summary":" We present a novel hybrid (model- and learning-based) architecture for fusing\nthe most significant features from conventional aerial images and integral\naerial images that result from synthetic aperture sensing for removing\nocclusion caused by dense vegetation. It combines the environment's spatial\nreferences with features of unoccluded targets. Our method outperforms the\nstate-of-the-art, does not require manually tuned parameters, can be extended\nto an arbitrary number and combinations of spectral channels, and is\nreconfigurable to address different use-cases.\n","authors":["Mohamed Youssef","Oliver Bimber"],"pdf_url":"https://arxiv.org/pdf/2311.17515v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17510v1","updated":"2023-11-29T10:35:00Z","published":"2023-11-29T10:35:00Z","title":"StructRe: Rewriting for Structured Shape Modeling","summary":" Man-made 3D shapes are naturally organized in parts and hierarchies; such\nstructures provide important constraints for shape reconstruction and\ngeneration. Modeling shape structures is difficult, because there can be\nmultiple hierarchies for a given shape, causing ambiguity, and across different\ncategories the shape structures are correlated with semantics, limiting\ngeneralization. We present StructRe, a structure rewriting system, as a novel\napproach to structured shape modeling. Given a 3D object represented by points\nand components, StructRe can rewrite it upward into more concise structures, or\ndownward into more detailed structures; by iterating the rewriting process,\nhierarchies are obtained. Such a localized rewriting process enables\nprobabilistic modeling of ambiguous structures and robust generalization across\nobject categories. We train StructRe on PartNet data and show its\ngeneralization to cross-category and multiple object hierarchies, and test its\nextension to ShapeNet. We also demonstrate the benefits of probabilistic and\ngeneralizable structure modeling for shape reconstruction, generation and\nediting tasks.\n","authors":["Jiepeng Wang","Hao Pan","Yang Liu","Xin Tong","Taku Komura","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17504v1","updated":"2023-11-29T10:27:56Z","published":"2023-11-29T10:27:56Z","title":"PViT-6D: Overclocking Vision Transformers for 6D Pose Estimation with\n Confidence-Level Prediction and Pose Tokens","summary":" In the current state of 6D pose estimation, top-performing techniques depend\non complex intermediate correspondences, specialized architectures, and\nnon-end-to-end algorithms. In contrast, our research reframes the problem as a\nstraightforward regression task by exploring the capabilities of Vision\nTransformers for direct 6D pose estimation through a tailored use of\nclassification tokens.
We also introduce a simple method for determining pose\nconfidence, which can be readily integrated into most 6D pose estimation\nframeworks. This involves modifying the transformer architecture by decreasing\nthe number of query elements based on the network's assessment of the scene\ncomplexity. Our method that we call Pose Vision Transformer or PViT-6D provides\nthe benefits of simple implementation and being end-to-end learnable while\noutperforming current state-of-the-art methods by +0.3% ADD(-S) on\nLinemod-Occlusion and +2.7% ADD(-S) on the YCB-V dataset. Moreover, our method\nenhances both the model's interpretability and the reliability of its\nperformance during inference.\n","authors":["Sebastian Stapf","Tobias Bauernfeind","Marco Riboldi"],"pdf_url":"https://arxiv.org/pdf/2311.17504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03373v2","updated":"2023-11-29T10:21:48Z","published":"2023-04-06T21:00:00Z","title":"Training-Free Layout Control with Cross-Attention Guidance","summary":" Recent diffusion-based generators can produce high-quality images from\ntextual prompts. However, they often disregard textual instructions that\nspecify the spatial layout of the composition. We propose a simple approach\nthat achieves robust layout control without the need for training or\nfine-tuning of the image generator. Our technique manipulates the\ncross-attention layers that the model uses to interface textual and visual\ninformation and steers the generation in the desired direction given, e.g., a\nuser-specified layout. To determine how to best guide attention, we study the\nrole of attention maps and explore two alternative strategies, forward and\nbackward guidance. We thoroughly evaluate our approach on three benchmarks and\nprovide several qualitative examples and a comparative analysis of the two\nstrategies that demonstrate the superiority of backward guidance compared to\nforward guidance, as well as prior work. We further demonstrate the versatility\nof layout guidance by extending it to applications such as editing the layout\nand context of real images.\n","authors":["Minghao Chen","Iro Laina","Andrea Vedaldi"],"pdf_url":"https://arxiv.org/pdf/2304.03373v2.pdf","comment":"WACV 2024, Project Page:\n https://silent-chen.github.io/layout-guidance/"},{"id":"http://arxiv.org/abs/2203.00948v4","updated":"2023-11-29T10:17:09Z","published":"2022-03-02T08:58:06Z","title":"CD-GAN: a robust fusion-based generative adversarial network for\n unsupervised remote sensing change detection with heterogeneous sensors","summary":" In the context of Earth observation, change detection boils down to comparing\nimages acquired at different times by sensors of possibly different spatial\nand/or spectral resolutions or different modalities (e.g., optical or radar).\nEven when considering only optical images, this task has proven to be\nchallenging as soon as the sensors differ by their spatial and/or spectral\nresolutions. This paper proposes a novel unsupervised change detection method\ndedicated to images acquired by such so-called heterogeneous optical sensors.\nIt capitalizes on recent advances which formulate the change detection task\ninto a robust fusion framework. 
Adopting this formulation, the work reported in\nthis paper shows that any off-the-shelf network trained beforehand to fuse\noptical images of different spatial and/or spectral resolutions can be easily\ncomplemented with a network of the same architecture and embedded into an\nadversarial framework to perform change detection. A comparison with\nstate-of-the-art change detection methods demonstrates the versatility and the\neffectiveness of the proposed approach.\n","authors":["Jin-Ju Wang","Nicolas Dobigeon","Marie Chabert","Ding-Cheng Wang","Ting-Zhu Huang","Jie Huang"],"pdf_url":"https://arxiv.org/pdf/2203.00948v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.00545v4","updated":"2023-11-29T10:10:04Z","published":"2023-01-02T07:13:28Z","title":"Knockoffs-SPR: Clean Sample Selection in Learning with Noisy Labels","summary":" A noisy training set usually leads to the degradation of the generalization\nand robustness of neural networks. In this paper, we propose a novel\ntheoretically guaranteed clean sample selection framework for learning with\nnoisy labels. Specifically, we first present a Scalable Penalized Regression\n(SPR) method, to model the linear relation between network features and one-hot\nlabels. In SPR, the clean data are identified by the zero mean-shift parameters\nsolved in the regression model. We theoretically show that SPR can recover\nclean data under some conditions. Under general scenarios, the conditions may\nbe no longer satisfied; and some noisy data are falsely selected as clean data.\nTo solve this problem, we propose a data-adaptive method for Scalable Penalized\nRegression with Knockoff filters (Knockoffs-SPR), which is provable to control\nthe False-Selection-Rate (FSR) in the selected clean data. To improve the\nefficiency, we further present a split algorithm that divides the whole\ntraining set into small pieces that can be solved in parallel to make the\nframework scalable to large datasets. While Knockoffs-SPR can be regarded as a\nsample selection module for a standard supervised training pipeline, we further\ncombine it with a semi-supervised algorithm to exploit the support of noisy\ndata as unlabeled data. Experimental results on several benchmark datasets and\nreal-world noisy datasets show the effectiveness of our framework and validate\nthe theoretical results of Knockoffs-SPR. Our code and pre-trained models are\navailable at https://github.com/Yikai-Wang/Knockoffs-SPR.\n","authors":["Yikai Wang","Yanwei Fu","Xinwei Sun"],"pdf_url":"https://arxiv.org/pdf/2301.00545v4.pdf","comment":"update: final version, to appear in TPAMI"},{"id":"http://arxiv.org/abs/2311.17493v1","updated":"2023-11-29T10:04:39Z","published":"2023-11-29T10:04:39Z","title":"Towards Higher Ranks via Adversarial Weight Pruning","summary":" Convolutional Neural Networks (CNNs) are hard to deploy on edge devices due\nto its high computation and storage complexities. As a common practice for\nmodel compression, network pruning consists of two major categories:\nunstructured and structured pruning, where unstructured pruning constantly\nperforms better. However, unstructured pruning presents a structured pattern at\nhigh pruning rates, which limits its performance. To this end, we propose a\nRank-based PruninG (RPG) method to maintain the ranks of sparse weights in an\nadversarial manner. 
In each step, we minimize the low-rank approximation error\nfor the weight matrices using singular value decomposition, and maximize their\ndistance by pushing the weight matrices away from its low rank approximation.\nThis rank-based optimization objective guides sparse weights towards a\nhigh-rank topology. The proposed method is conducted in a gradual pruning\nfashion to stabilize the change of rank during training. Experimental results\non various datasets and different tasks demonstrate the effectiveness of our\nalgorithm in high sparsity. The proposed RPG outperforms the state-of-the-art\nperformance by 1.13% top-1 accuracy on ImageNet in ResNet-50 with 98% sparsity.\nThe codes are available at\nhttps://github.com/huawei-noah/Efficient-Computing/tree/master/Pruning/RPG and\nhttps://gitee.com/mindspore/models/tree/master/research/cv/RPG.\n","authors":["Yuchuan Tian","Hanting Chen","Tianyu Guo","Chao Xu","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17493v1.pdf","comment":"NeurIPS 2023 Accepted"},{"id":"http://arxiv.org/abs/2311.08972v2","updated":"2023-11-29T09:57:06Z","published":"2023-11-15T14:04:37Z","title":"Unsupervised approaches based on optimal transport and convex analysis\n for inverse problems in imaging","summary":" Unsupervised deep learning approaches have recently become one of the crucial\nresearch areas in imaging owing to their ability to learn expressive and\npowerful reconstruction operators even when paired high-quality training data\nis scarcely available. In this chapter, we review theoretically principled\nunsupervised learning schemes for solving imaging inverse problems, with a\nparticular focus on methods rooted in optimal transport and convex analysis. We\nbegin by reviewing the optimal transport-based unsupervised approaches such as\nthe cycle-consistency-based models and learned adversarial regularization\nmethods, which have clear probabilistic interpretations. Subsequently, we give\nan overview of a recent line of works on provably convergent learned\noptimization algorithms applied to accelerate the solution of imaging inverse\nproblems, alongside their dedicated unsupervised training schemes. We also\nsurvey a number of provably convergent plug-and-play algorithms (based on\ngradient-step deep denoisers), which are among the most important and widely\napplied unsupervised approaches for imaging problems. At the end of this\nsurvey, we provide an overview of a few related unsupervised learning\nframeworks that complement our focused schemes. Together with a detailed\nsurvey, we provide an overview of the key mathematical results that underlie\nthe methods reviewed in the chapter to keep our discussion self-contained.\n","authors":["Marcello Carioni","Subhadip Mukherjee","Hong Ye Tan","Junqi Tang"],"pdf_url":"https://arxiv.org/pdf/2311.08972v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17491v1","updated":"2023-11-29T09:55:13Z","published":"2023-11-29T09:55:13Z","title":"Spherical Frustum Sparse Convolution Network for LiDAR Point Cloud\n Semantic Segmentation","summary":" LiDAR point cloud semantic segmentation enables the robots to obtain\nfine-grained semantic information of the surrounding environment. Recently,\nmany works project the point cloud onto the 2D image and adopt the 2D\nConvolutional Neural Networks (CNNs) or vision transformer for LiDAR point\ncloud semantic segmentation. 
However, since more than one point can be\nprojected onto the same 2D position but only one point can be preserved, the\nprevious 2D image-based segmentation methods suffer from inevitable quantized\ninformation loss. To avoid quantized information loss, in this paper, we\npropose a novel spherical frustum structure. The points projected onto the same\n2D position are preserved in the spherical frustums. Moreover, we propose a\nmemory-efficient hash-based representation of spherical frustums. Through the\nhash-based representation, we propose the Spherical Frustum sparse Convolution\n(SFC) and Frustum Fast Point Sampling (F2PS) to convolve and sample the points\nstored in spherical frustums respectively. Finally, we present the Spherical\nFrustum sparse Convolution Network (SFCNet) to adopt 2D CNNs for LiDAR point\ncloud semantic segmentation without quantized information loss. Extensive\nexperiments on the SemanticKITTI and nuScenes datasets demonstrate that our\nSFCNet outperforms the 2D image-based semantic segmentation methods based on\nconventional spherical projection. The source code will be released later.\n","authors":["Yu Zheng","Guangming Wang","Jiuming Liu","Marc Pollefeys","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17491v1.pdf","comment":"17 pages, 10 figures, under review"},{"id":"http://arxiv.org/abs/2311.17486v1","updated":"2023-11-29T09:48:01Z","published":"2023-11-29T09:48:01Z","title":"Non-Visible Light Data Synthesis and Application: A Case Study for\n Synthetic Aperture Radar Imagery","summary":" We explore the \"hidden\" ability of large-scale pre-trained image generation\nmodels, such as Stable Diffusion and Imagen, in non-visible light domains,\ntaking Synthetic Aperture Radar (SAR) data for a case study. Due to the\ninherent challenges in capturing satellite data, acquiring ample SAR training\nsamples is infeasible. For instance, for a particular category of ship in the\nopen sea, we can collect only few-shot SAR images which are too limited to\nderive effective ship recognition models. If large-scale models pre-trained\nwith regular images can be adapted to generating novel SAR images, the problem\nis solved. In preliminary study, we found that fine-tuning these models with\nfew-shot SAR images is not working, as the models can not capture the two\nprimary differences between SAR and regular images: structure and modality. To\naddress this, we propose a 2-stage low-rank adaptation method, and we call it\n2LoRA. In the first stage, the model is adapted using aerial-view regular image\ndata (whose structure matches SAR), followed by the second stage where the base\nmodel from the first stage is further adapted using SAR modality data.\nParticularly in the second stage, we introduce a novel prototype LoRA (pLoRA),\nas an improved version of 2LoRA, to resolve the class imbalance problem in SAR\ndatasets. For evaluation, we employ the resulting generation model to\nsynthesize additional SAR data. 
This augmentation, when integrated into the\ntraining process of SAR classification as well as segmentation models, yields\nnotably improved performance for minor classes\n","authors":["Zichen Tian","Zhaozheng Chen","Qianru Sun"],"pdf_url":"https://arxiv.org/pdf/2311.17486v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06255v3","updated":"2023-11-29T09:32:10Z","published":"2023-04-13T04:21:45Z","title":"SPColor: Semantic Prior Guided Exemplar-based Image Colorization","summary":" Exemplar-based image colorization aims to colorize a target grayscale image\nbased on a color reference image, and the key is to establish accurate\npixel-level semantic correspondence between these two images. Previous methods\nsearch for correspondence across the entire reference image, and this type of\nglobal matching is easy to get mismatch. We summarize the difficulties in two\naspects: (1) When the reference image only contains a part of objects related\nto target image, improper correspondence will be established in unrelated\nregions. (2) It is prone to get mismatch in regions where the shape or texture\nof the object is easily confused. To overcome these issues, we propose SPColor,\na semantic prior guided exemplar-based image colorization framework. Different\nfrom previous methods, SPColor first coarsely classifies pixels of the\nreference and target images to several pseudo-classes under the guidance of\nsemantic prior, then the correspondences are only established locally between\nthe pixels in the same class via the newly designed semantic prior guided\ncorrespondence network. In this way, improper correspondence between different\nsemantic classes is explicitly excluded, and the mismatch is obviously\nalleviated. Besides, to better reserve the color from reference, a similarity\nmasked perceptual loss is designed. Noting that the carefully designed SPColor\nutilizes the semantic prior provided by an unsupervised segmentation model,\nwhich is free for additional manual semantic annotations. Experiments\ndemonstrate that our model outperforms recent state-of-the-art methods both\nquantitatively and qualitatively on public dataset.\n","authors":["Siqi Chen","Xueming Li","Xianlin Zhang","Mingdao Wang","Yu Zhang","Yue Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.06255v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17475v1","updated":"2023-11-29T09:31:31Z","published":"2023-11-29T09:31:31Z","title":"CLiSA: A Hierarchical Hybrid Transformer Model using Orthogonal Cross\n Attention for Satellite Image Cloud Segmentation","summary":" Clouds in optical satellite images are a major concern since their presence\nhinders the ability to carry accurate analysis as well as processing. Presence\nof clouds also affects the image tasking schedule and results in wastage of\nvaluable storage space on ground as well as space-based systems. Due to these\nreasons, deriving accurate cloud masks from optical remote-sensing images is an\nimportant task. Traditional methods such as threshold-based, spatial filtering\nfor cloud detection in satellite images suffer from lack of accuracy. In recent\nyears, deep learning algorithms have emerged as a promising approach to solve\nimage segmentation problems as it allows pixel-level classification and\nsemantic-level segmentation. In this paper, we introduce a deep-learning model\nbased on hybrid transformer architecture for effective cloud mask generation\nnamed CLiSA - Cloud segmentation via Lipschitz Stable Attention network. 
In\nthis context, we propose a concept of orthogonal self-attention combined with a\nhierarchical cross-attention model, and we validate its Lipschitz stability\ntheoretically and empirically. We design the whole setup under an adversarial\nsetting in the presence of the Lov\\'asz-Softmax loss. We demonstrate both qualitative\nand quantitative outcomes for multiple satellite image datasets including\nLandsat-8, Sentinel-2, and Cartosat-2s. Through a comparative study, we show\nthat our model performs favorably against other state-of-the-art methods and\nalso provides better generalization in precise cloud extraction from satellite\nmulti-spectral (MX) images. We also showcase different ablation studies to\nendorse our choices corresponding to different architectural elements and\nobjective functions.\n","authors":["Subhajit Paul","Ashutosh Gupta"],"pdf_url":"https://arxiv.org/pdf/2311.17475v1.pdf","comment":"14 pages, 11 figures, 7 tables"},{"id":"http://arxiv.org/abs/2311.17466v1","updated":"2023-11-29T09:18:39Z","published":"2023-11-29T09:18:39Z","title":"Slot-Mixup with Subsampling: A Simple Regularization for WSI\n Classification","summary":" Whole slide image (WSI) classification requires repetitive zoom-in and out\nfor pathologists, as only small portions of the slide may be relevant to\ndetecting cancer. Due to the lack of patch-level labels, multiple instance\nlearning (MIL) is a common practice for training a WSI classifier. One of the\nchallenges in MIL for WSIs is the weak supervision coming only from the\nslide-level labels, often resulting in severe overfitting. In response,\nresearchers have considered adopting patch-level augmentation or applying mixup\naugmentation, but their applicability remains unverified. Our approach augments\nthe training dataset by sampling a subset of patches in the WSI without\nsignificantly altering the underlying semantics of the original slides.\nAdditionally, we introduce an efficient model (Slot-MIL) that organizes patches\ninto a fixed number of slots, the abstract representation of patches, using an\nattention mechanism. We empirically demonstrate that the subsampling\naugmentation helps to make more informative slots by restricting the\nover-concentration of attention and to improve interpretability. Finally, we\nillustrate that combining our attention-based aggregation model with\nsubsampling and mixup, which has shown limited compatibility in existing MIL\nmethods, can enhance both generalization and calibration. Our proposed methods\nachieve the state-of-the-art performance across various benchmark datasets\nincluding class imbalance and distribution shifts.\n","authors":["Seongho Keum","Sanghyun Kim","Soojeong Lee","Juho Lee"],"pdf_url":"https://arxiv.org/pdf/2311.17466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17465v1","updated":"2023-11-29T09:13:00Z","published":"2023-11-29T09:13:00Z","title":"AgentAvatar: Disentangling Planning, Driving and Rendering for\n Photorealistic Avatar Agents","summary":" In this study, our goal is to create interactive avatar agents that can\nautonomously plan and animate nuanced facial movements realistically, from both\nvisual and behavioral perspectives. Given high-level inputs about the\nenvironment and agent profile, our framework harnesses LLMs to produce a series\nof detailed text descriptions of the avatar agents' facial motions.
These\ndescriptions are then processed by our task-agnostic driving engine into motion\ntoken sequences, which are subsequently converted into continuous motion\nembeddings that are further consumed by our standalone neural-based renderer to\ngenerate the final photorealistic avatar animations. These streamlined\nprocesses allow our framework to adapt to a variety of non-verbal avatar\ninteractions, both monadic and dyadic. Our extensive study, which includes\nexperiments on both newly compiled and existing datasets featuring two types of\nagents -- one capable of monadic interaction with the environment, and the\nother designed for dyadic conversation -- validates the effectiveness and\nversatility of our approach. To our knowledge, we advanced a leap step by\ncombining LLMs and neural rendering for generalized non-verbal prediction and\nphoto-realistic rendering of avatar agents.\n","authors":["Duomin Wang","Bin Dai","Yu Deng","Baoyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17465v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2311.17461v1","updated":"2023-11-29T09:05:14Z","published":"2023-11-29T09:05:14Z","title":"When StyleGAN Meets Stable Diffusion: a $\\mathscr{W}_+$ Adapter for\n Personalized Image Generation","summary":" Text-to-image diffusion models have remarkably excelled in producing diverse,\nhigh-quality, and photo-realistic images. This advancement has spurred a\ngrowing interest in incorporating specific identities into generated content.\nMost current methods employ an inversion approach to embed a target visual\nconcept into the text embedding space using a single reference image. However,\nthe newly synthesized faces either closely resemble the reference image in\nterms of facial attributes, such as expression, or exhibit a reduced capacity\nfor identity preservation. Text descriptions intended to guide the facial\nattributes of the synthesized face may fall short, owing to the intricate\nentanglement of identity information with identity-irrelevant facial attributes\nderived from the reference image. To address these issues, we present the novel\nuse of the extended StyleGAN embedding space $\\mathcal{W}_+$, to achieve\nenhanced identity preservation and disentanglement for diffusion models. By\naligning this semantically meaningful human face latent space with\ntext-to-image diffusion models, we succeed in maintaining high fidelity in\nidentity preservation, coupled with the capacity for semantic editing.\nAdditionally, we propose new training objectives to balance the influences of\nboth prompt and identity conditions, ensuring that the identity-irrelevant\nbackground remains unaffected during facial attribute modifications. Extensive\nexperiments reveal that our method adeptly generates personalized text-to-image\noutputs that are not only compatible with prompt descriptions but also amenable\nto common StyleGAN editing directions in diverse settings. 
Our source code will\nbe available at \\url{https://github.com/csxmli2016/w-plus-adapter}.\n","authors":["Xiaoming Li","Xinyu Hou","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2311.17461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17460v1","updated":"2023-11-29T09:02:07Z","published":"2023-11-29T09:02:07Z","title":"W-HMR: Human Mesh Recovery in World Space with Weak-supervised Camera\n Calibration and Orientation Correction","summary":" For a long time, in the field of reconstructing 3D human bodies from\nmonocular images, most methods opted to simplify the task by minimizing the\ninfluence of the camera. Using a coarse focal length setting results in the\nreconstructed bodies not aligning well with distorted images. Ignoring camera\nrotation leads to an unrealistic reconstructed body pose in world space.\nConsequently, existing methods' application scenarios are confined to\ncontrolled environments. And they struggle to achieve accurate and reasonable\nreconstruction in world space when confronted with complex and diverse\nin-the-wild images. To address the above issues, we propose W-HMR, which\ndecouples global body recovery into camera calibration, local body recovery and\nglobal body orientation correction. We design the first weak-supervised camera\ncalibration method for body distortion, eliminating dependence on focal length\nlabels and achieving finer mesh-image alignment. We propose a novel orientation\ncorrection module to allow the reconstructed human body to remain normal in\nworld space. Decoupling body orientation and body pose enables our model to\nconsider the accuracy in camera coordinate and the reasonableness in world\ncoordinate simultaneously, expanding the range of applications. As a result,\nW-HMR achieves high-quality reconstruction in dual coordinate systems,\nparticularly in challenging scenes. Codes will be released on\nhttps://yw0208.github.io/ after publication.\n","authors":["Wei Yao","Hongwen Zhang","Yunlian Sun","Jinhui Tang"],"pdf_url":"https://arxiv.org/pdf/2311.17460v1.pdf","comment":"Project Page: https://yw0208.github.io"},{"id":"http://arxiv.org/abs/2305.07598v4","updated":"2023-11-29T08:56:29Z","published":"2023-05-12T16:42:54Z","title":"Hausdorff Distance Matching with Adaptive Query Denoising for Rotated\n Detection Transformer","summary":" The Detection Transformer (DETR) has emerged as a pivotal role in object\ndetection tasks, setting new performance benchmarks due to its end-to-end\ndesign and scalability. Despite its advancements, the application of DETR in\ndetecting rotated objects has demonstrated suboptimal performance relative to\nestablished oriented object detectors. Our analysis identifies a key\nlimitation: the L1 cost used in Hungarian Matching leads to duplicate\npredictions due to the square-like problem in oriented object detection,\nthereby obstructing the training process of the detector. We introduce a\nHausdorff distance-based cost for Hungarian matching, which more accurately\nquantifies the discrepancy between predictions and ground truths. Moreover, we\nnote that a static denoising approach hampers the training of rotated DETR,\nparticularly when the detector's predictions surpass the quality of noised\nground truths. We propose an adaptive query denoising technique, employing\nHungarian matching to selectively filter out superfluous noised queries that no\nlonger contribute to model improvement. 
Our proposed modifications to DETR have\nresulted in superior performance, surpassing previous rotated DETR models and\nother alternatives. This is evidenced by our model's state-of-the-art\nachievements in benchmarks such as DOTA-v1.0/v1.5/v2.0, and DIOR-R.\n","authors":["Hakjin Lee","Minki Song","Jamyoung Koo","Junghoon Seo"],"pdf_url":"https://arxiv.org/pdf/2305.07598v4.pdf","comment":"Under review, 16 pages, 12 tables, 8 figures"},{"id":"http://arxiv.org/abs/2311.17456v1","updated":"2023-11-29T08:56:24Z","published":"2023-11-29T08:56:24Z","title":"DifFlow3D: Toward Robust Uncertainty-Aware Scene Flow Estimation with\n Diffusion Model","summary":" Scene flow estimation, which aims to predict per-point 3D displacements of\ndynamic scenes, is a fundamental task in the computer vision field. However,\nprevious works commonly suffer from unreliable correlation caused by locally\nconstrained searching ranges, and struggle with accumulated inaccuracy arising\nfrom the coarse-to-fine structure. To alleviate these problems, we propose a\nnovel uncertainty-aware scene flow estimation network (DifFlow3D) with the\ndiffusion probabilistic model. Iterative diffusion-based refinement is designed\nto enhance the correlation robustness and resilience to challenging cases,\ne.g., dynamics, noisy inputs, repetitive patterns, etc. To restrain the\ngeneration diversity, three key flow-related features are leveraged as\nconditions in our diffusion model. Furthermore, we also develop an uncertainty\nestimation module within diffusion to evaluate the reliability of estimated\nscene flow. Our DifFlow3D achieves state-of-the-art performance, with 6.7\\% and\n19.1\\% EPE3D reduction respectively on FlyingThings3D and KITTI 2015 datasets.\nNotably, our method achieves an unprecedented millimeter-level accuracy\n(0.0089m in EPE3D) on the KITTI dataset. Additionally, our diffusion-based\nrefinement paradigm can be readily integrated as a plug-and-play module into\nexisting scene flow networks, significantly increasing their estimation\naccuracy. Codes will be released later.\n","authors":["Jiuming Liu","Guangming Wang","Weicai Ye","Chaokang Jiang","Jinru Han","Zhe Liu","Guofeng Zhang","Dalong Du","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17456v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17450v1","updated":"2023-11-29T08:46:46Z","published":"2023-11-29T08:46:46Z","title":"Continual Learning for Image Segmentation with Dynamic Query","summary":" Image segmentation based on continual learning exhibits a critical drop of\nperformance, mainly due to catastrophic forgetting and background shift, as\nthey are required to incorporate new classes continually. In this paper, we\npropose a simple, yet effective Continual Image Segmentation method with\nincremental Dynamic Query (CISDQ), which decouples the representation learning\nof both old and new knowledge with lightweight query embedding. CISDQ mainly\nincludes three contributions: 1) We define dynamic queries with adaptive\nbackground class to exploit past knowledge and learn future classes naturally.\n2) CISDQ proposes a class/instance-aware Query Guided Knowledge Distillation\nstrategy to overcome catastrophic forgetting by capturing the inter-class\ndiversity and intra-class identity. 3) Apart from semantic segmentation, CISDQ\nintroduce the continual learning for instance segmentation in which\ninstance-wise labeling and supervision are considered. 
Extensive experiments on\nthree datasets for two tasks (i.e., continual semantic and instance\nsegmentation are conducted to demonstrate that CISDQ achieves the\nstate-of-the-art performance, specifically, obtaining 4.4% and 2.9% mIoU\nimprovements for the ADE 100-10 (6 steps) setting and ADE 100-5 (11 steps)\nsetting.\n","authors":["Weijia Wu","Yuzhong Zhao","Zhuang Li","Lianlei Shan","Hong Zhou","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2311.17450v1.pdf","comment":"Code: https://github.com/weijiawu/CisDQ"},{"id":"http://arxiv.org/abs/2311.17449v1","updated":"2023-11-29T08:43:04Z","published":"2023-11-29T08:43:04Z","title":"Weakly-semi-supervised object detection in remotely sensed imagery","summary":" Deep learning for detecting objects in remotely sensed imagery can enable new\ntechnologies for important applications including mitigating climate change.\nHowever, these models often require large datasets labeled with bounding box\nannotations which are expensive to curate, prohibiting the development of\nmodels for new tasks and geographies. To address this challenge, we develop\nweakly-semi-supervised object detection (WSSOD) models on remotely sensed\nimagery which can leverage a small amount of bounding boxes together with a\nlarge amount of point labels that are easy to acquire at scale in geospatial\ndata. We train WSSOD models which use large amounts of point-labeled images\nwith varying fractions of bounding box labeled images in FAIR1M and a wind\nturbine detection dataset, and demonstrate that they substantially outperform\nfully supervised models trained with the same amount of bounding box labeled\nimages on both datasets. Furthermore, we find that the WSSOD models trained\nwith 2-10x fewer bounding box labeled images can perform similarly to or\noutperform fully supervised models trained on the full set of bounding-box\nlabeled images. We believe that the approach can be extended to other remote\nsensing tasks to reduce reliance on bounding box labels and increase\ndevelopment of models for impactful applications.\n","authors":["Ji Hun Wang","Jeremy Irvin","Beri Kohen Behar","Ha Tran","Raghav Samavedam","Quentin Hsu","Andrew Y. Ng"],"pdf_url":"https://arxiv.org/pdf/2311.17449v1.pdf","comment":"Tackling Climate Change with Machine Learning at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.20208v2","updated":"2023-11-29T08:33:30Z","published":"2023-10-31T06:11:23Z","title":"ZoomNeXt: A Unified Collaborative Pyramid Network for Camouflaged Object\n Detection","summary":" Recent camouflaged object detection (COD) attempts to segment objects\nvisually blended into their surroundings, which is extremely complex and\ndifficult in real-world scenarios. Apart from the high intrinsic similarity\nbetween camouflaged objects and their background, objects are usually diverse\nin scale, fuzzy in appearance, and even severely occluded. To this end, we\npropose an effective unified collaborative pyramid network which mimics human\nbehavior when observing vague images and videos, \\textit{i.e.}, zooming in and\nout. Specifically, our approach employs the zooming strategy to learn\ndiscriminative mixed-scale semantics by the multi-head scale integration and\nrich granularity perception units, which are designed to fully explore\nimperceptible clues between candidate objects and background surroundings. The\nformer's intrinsic multi-head aggregation provides more diverse visual\npatterns. 
The latter's routing mechanism can effectively propagate inter-frame\ndifferences in spatiotemporal scenarios and adaptively ignore static\nrepresentations. They provide a solid foundation for realizing a unified\narchitecture for static and dynamic COD. Moreover, considering the uncertainty\nand ambiguity derived from indistinguishable textures, we construct a simple\nyet effective regularization, uncertainty awareness loss, to encourage\npredictions with higher confidence in candidate regions. Our highly\ntask-friendly framework consistently outperforms existing state-of-the-art\nmethods in image and video COD benchmarks. The code will be available at\n\\url{https://github.com/lartpang/ZoomNeXt}.\n","authors":["Youwei Pang","Xiaoqi Zhao","Tian-Zhu Xiang","Lihe Zhang","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2310.20208v2.pdf","comment":"Extensions to the conference version: arXiv:2203.02688; Fixed some\n word errors"},{"id":"http://arxiv.org/abs/2311.17435v1","updated":"2023-11-29T08:27:00Z","published":"2023-11-29T08:27:00Z","title":"MM-Narrator: Narrating Long-form Videos with Multimodal In-Context\n Learning","summary":" We present MM-Narrator, a novel system leveraging GPT-4 with multimodal\nin-context learning for the generation of audio descriptions (AD). Unlike\nprevious methods that primarily focused on downstream fine-tuning with short\nvideo clips, MM-Narrator excels in generating precise audio descriptions for\nvideos of extensive lengths, even beyond hours, in an autoregressive manner.\nThis capability is made possible by the proposed memory-augmented generation\nprocess, which effectively utilizes both the short-term textual context and\nlong-term visual memory through an efficient register-and-recall mechanism.\nThese contextual memories compile pertinent past information, including\nstorylines and character identities, ensuring an accurate tracking and\ndepicting of story-coherent and character-centric audio descriptions.\nMaintaining the training-free design of MM-Narrator, we further propose a\ncomplexity-based demonstration selection strategy to largely enhance its\nmulti-step reasoning capability via few-shot multimodal in-context learning\n(MM-ICL). Experimental results on MAD-eval dataset demonstrate that MM-Narrator\nconsistently outperforms both the existing fine-tuning-based approaches and\nLLM-based approaches in most scenarios, as measured by standard evaluation\nmetrics. Additionally, we introduce the first segment-based evaluator for\nrecurrent text generation. Empowered by GPT-4, this evaluator comprehensively\nreasons and marks AD generation performance in various extendable dimensions.\n","authors":["Chaoyi Zhang","Kevin Lin","Zhengyuan Yang","Jianfeng Wang","Linjie Li","Chung-Ching Lin","Zicheng Liu","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17435v1.pdf","comment":"Project page at https://mm-narrator.github.io/"},{"id":"http://arxiv.org/abs/2311.17434v1","updated":"2023-11-29T08:26:18Z","published":"2023-11-29T08:26:18Z","title":"Group-wise Sparse and Explainable Adversarial Attacks","summary":" Sparse adversarial attacks fool deep neural networks (DNNs) through minimal\npixel perturbations, typically regularized by the $\\ell_0$ norm. Recent efforts\nhave replaced this norm with a structural sparsity regularizer, such as the\nnuclear group norm, to craft group-wise sparse adversarial attacks. 
The\nresulting perturbations are thus explainable and hold significant practical\nrelevance, shedding light on an even greater vulnerability of DNNs than\npreviously anticipated. However, crafting such attacks poses an optimization\nchallenge, as it involves computing norms for groups of pixels within a\nnon-convex objective. In this paper, we tackle this challenge by presenting an\nalgorithm that simultaneously generates group-wise sparse attacks within\nsemantically meaningful areas of an image. In each iteration, the core\noperation of our algorithm involves the optimization of a quasinorm adversarial\nloss. This optimization is achieved by employing the $1/2$-quasinorm proximal\noperator for some iterations, a method tailored for nonconvex programming.\nSubsequently, the algorithm transitions to a projected Nesterov's accelerated\ngradient descent with $2$-norm regularization applied to perturbation\nmagnitudes. We rigorously evaluate the efficacy of our novel attack in both\ntargeted and non-targeted attack scenarios, on CIFAR-10 and ImageNet datasets.\nWhen compared to state-of-the-art methods, our attack consistently results in a\nremarkable increase in group-wise sparsity, e.g., an increase of $48.12\\%$ on\nCIFAR-10 and $40.78\\%$ on ImageNet (average case, targeted attack), all while\nmaintaining lower perturbation magnitudes. Notably, this performance is\ncomplemented by a significantly faster computation time and a $100\\%$ attack\nsuccess rate.\n","authors":["Shpresim Sadiku","Moritz Wagner","Sebastian Pokutta"],"pdf_url":"https://arxiv.org/pdf/2311.17434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16133v2","updated":"2023-11-29T08:24:57Z","published":"2023-11-02T13:14:01Z","title":"Effective Quantization for Diffusion Models on CPUs","summary":" Diffusion models have gained popularity for generating images from textual\ndescriptions. Nonetheless, the substantial need for computational resources\ncontinues to present a noteworthy challenge, contributing to time-consuming\nprocesses. Quantization, a technique employed to compress deep learning models\nfor enhanced efficiency, presents challenges when applied to diffusion models.\nThese models are notably more sensitive to quantization compared to other model\ntypes, potentially resulting in a degradation of image quality. In this paper,\nwe introduce a novel approach to quantize the diffusion models by leveraging\nboth quantization-aware training and distillation. Our results show the\nquantized models can maintain the high image quality while demonstrating the\ninference efficiency on CPUs. The code is publicly available at:\nhttps://github.com/intel/intel-extension-for-transformers.\n","authors":["Hanwen Chang","Haihao Shen","Yiyang Cai","Xinyu Ye","Zhenzhong Xu","Wenhua Cheng","Kaokao Lv","Weiwei Zhang","Yintong Lu","Heng Guo"],"pdf_url":"https://arxiv.org/pdf/2311.16133v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13187v2","updated":"2023-11-29T08:22:11Z","published":"2023-11-22T06:28:30Z","title":"NeISF: Neural Incident Stokes Field for Geometry and Material Estimation","summary":" Multi-view inverse rendering is the problem of estimating the scene\nparameters such as shapes, materials, or illuminations from a sequence of\nimages captured under different viewpoints. Many approaches, however, assume\nsingle light bounce and thus fail to recover challenging scenarios like\ninter-reflections. 
On the other hand, simply extending those methods to\nconsider multi-bounced light requires more assumptions to alleviate the\nambiguity. To address this problem, we propose Neural Incident Stokes Fields\n(NeISF), a multi-view inverse rendering framework that reduces ambiguities\nusing polarization cues. The primary motivation for using polarization cues is\nthat it is the accumulation of multi-bounced light, providing rich information\nabout geometry and material. Based on this knowledge, the proposed incident\nStokes field efficiently models the accumulated polarization effect with the\naid of an original physically-based differentiable polarimetric renderer.\nLastly, experimental results show that our method outperforms the existing\nworks in synthetic and real scenarios.\n","authors":["Chenhao Li","Taishi Ono","Takeshi Uemori","Hajime Mihara","Alexander Gatto","Hajime Nagahara","Yusuke Moriuchi"],"pdf_url":"https://arxiv.org/pdf/2311.13187v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05189v4","updated":"2023-11-29T08:18:14Z","published":"2023-05-09T05:48:38Z","title":"SUR-adapter: Enhancing Text-to-Image Pre-trained Diffusion Models with\n Large Language Models","summary":" Diffusion models, which have emerged to become popular text-to-image\ngeneration models, can produce high-quality and content-rich images guided by\ntextual prompts. However, there are limitations to semantic understanding and\ncommonsense reasoning in existing models when the input prompts are concise\nnarrative, resulting in low-quality image generation. To improve the capacities\nfor narrative prompts, we propose a simple-yet-effective parameter-efficient\nfine-tuning approach called the Semantic Understanding and Reasoning adapter\n(SUR-adapter) for pre-trained diffusion models. To reach this goal, we first\ncollect and annotate a new dataset SURD which consists of more than 57,000\nsemantically corrected multi-modal samples. Each sample contains a simple\nnarrative prompt, a complex keyword-based prompt, and a high-quality image.\nThen, we align the semantic representation of narrative prompts to the complex\nprompts and transfer knowledge of large language models (LLMs) to our\nSUR-adapter via knowledge distillation so that it can acquire the powerful\nsemantic understanding and reasoning capabilities to build a high-quality\ntextual semantic representation for text-to-image generation. We conduct\nexperiments by integrating multiple LLMs and popular pre-trained diffusion\nmodels to show the effectiveness of our approach in enabling diffusion models\nto understand and reason concise natural language without image quality\ndegradation. Our approach can make text-to-image diffusion models easier to use\nwith better user experience, which demonstrates our approach has the potential\nfor further advancing the development of user-friendly text-to-image generation\nmodels by bridging the semantic gap between simple narrative prompts and\ncomplex keyword-based prompts. 
The code is released at\nhttps://github.com/Qrange-group/SUR-adapter.\n","authors":["Shanshan Zhong","Zhongzhan Huang","Wushao Wen","Jinghui Qin","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2305.05189v4.pdf","comment":"accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2311.17428v1","updated":"2023-11-29T08:09:01Z","published":"2023-11-29T08:09:01Z","title":"SigFormer: Sparse Signal-Guided Transformer for Multi-Modal Human Action\n Segmentation","summary":" Multi-modal human action segmentation is a critical and challenging task with\na wide range of applications. Nowadays, the majority of approaches concentrate\non the fusion of dense signals (i.e., RGB, optical flow, and depth maps).\nHowever, the potential contributions of sparse IoT sensor signals, which can be\ncrucial for achieving accurate recognition, have not been fully explored. To\nmake up for this, we introduce a Sparse signal-guided Transformer (SigFormer) to\ncombine both dense and sparse signals. We employ mask attention to fuse\nlocalized features by constraining cross-attention within the regions where\nsparse signals are valid. However, since sparse signals are discrete, they lack\nsufficient information about the temporal action boundaries. Therefore, in\nSigFormer, we propose to emphasize the boundary information at two stages to\nalleviate this problem. In the first feature extraction stage, we introduce an\nintermediate bottleneck module to jointly learn both category and boundary\nfeatures of each dense modality through the inner loss functions. After the\nfusion of dense modalities and sparse signals, we then devise a two-branch\narchitecture that explicitly models the interrelationship between action\ncategory and temporal boundary. Experimental results demonstrate that SigFormer\noutperforms the state-of-the-art approaches on a multi-modal action\nsegmentation dataset from real industrial environments, reaching an outstanding\nF1 score of 0.958. The codes and pre-trained models are available at\nhttps://github.com/LIUQI-creat/SigFormer.\n","authors":["Qi Liu","Xinchen Liu","Kun Liu","Xiaoyan Gu","Wu Liu"],"pdf_url":"https://arxiv.org/pdf/2311.17428v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17425v1","updated":"2023-11-29T07:57:30Z","published":"2023-11-29T07:57:30Z","title":"SpeechAct: Towards Generating Whole-body Motion from Speech","summary":" This paper addresses the problem of generating whole-body motion from speech.\nDespite great successes, prior methods still struggle to produce reasonable and\ndiverse whole-body motions from speech. This is due to their reliance on\nsuboptimal representations and a lack of strategies for generating diverse\nresults. To address these challenges, we present a novel hybrid point\nrepresentation to achieve accurate and continuous motion generation, e.g.,\navoiding foot skating, and this representation can be transformed into an\neasy-to-use representation, i.e., SMPL-X body mesh, for many applications. To\ngenerate whole-body motion from speech, for facial motion, closely tied to the\naudio signal, we introduce an encoder-decoder architecture to achieve\ndeterministic outcomes. However, for the body and hands, which have weaker\nconnections to the audio signal, we aim to generate diverse yet reasonable\nmotions. To boost diversity in motion generation, we propose a contrastive\nmotion learning method to encourage the model to produce more distinctive\nrepresentations. 
Specifically, we design a robust VQ-VAE to learn a quantized\nmotion codebook using our hybrid representation. Then, we regress the motion\nrepresentation from the audio signal by a translation model employing our\ncontrastive motion learning method. Experimental results validate the superior\nperformance and the correctness of our model. The project page is available for\nresearch purposes at http://cic.tju.edu.cn/faculty/likun/projects/SpeechAct.\n","authors":["Jinsong Zhang","Minjie Zhu","Yuxiang Zhang","Yebin Liu","Kun Li"],"pdf_url":"https://arxiv.org/pdf/2311.17425v1.pdf","comment":"Project page: http://cic.tju.edu.cn/faculty/likun/projects/SpeechAct"},{"id":"http://arxiv.org/abs/2311.08100v2","updated":"2023-11-29T07:53:47Z","published":"2023-11-14T11:53:24Z","title":"DeepEMplanner: An End-to-End EM Motion Planner with Iterative\n Interactions","summary":" Motion planning is a computational problem that finds a sequence of valid\ntrajectories, often based on surrounding agents' forecasting, environmental\nunderstanding, and historical and future contexts. It can also be viewed as a\ngame in which agents continuously plan their next move according to other\nagents' intentions and the encountering environment, further achieving their\nultimate goals through incremental actions. To model the dynamic planning and\ninteraction process, we propose a novel framework, DeepEMplanner, which takes\nthe stepwise interaction into account for fine-grained behavior learning. The\nego vehicle maximizes each step motion to reach its eventual driving outcome\nbased on the stepwise expectation from agents and its upcoming road conditions.\nOn the other hand, the agents also follow the same philosophy to maximize their\nstepwise behavior under the encountering environment and the expectations from\nego and other agents. Our DeepEMplanner models the interactions among ego,\nagents, and the dynamic environment in an autoregressive manner by interleaving\nthe Expectation and Maximization processes. Further, we design ego-to-agents,\nego-to-map, and ego-to-BEV interaction mechanisms with hierarchical dynamic key\nobjects attention to better model the interactions. Experiments on the nuScenes\nbenchmark show that our approach achieves state-of-the-art results.\n","authors":["Zhili Chen","Maosheng Ye","Shuangjie Xu","Tongyi Cao","Qifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2311.08100v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14327v3","updated":"2023-11-29T07:52:18Z","published":"2023-09-25T17:53:29Z","title":"DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via\n Multi-Modal Causal Attention","summary":" Most of the existing multi-modal models, hindered by their incapacity to\nadeptly manage interleaved image-and-text inputs in multi-image, multi-round\ndialogues, face substantial constraints in resource allocation for training and\ndata accessibility, impacting their adaptability and scalability across varied\ninteraction realms. To address this, we present the DeepSpeed-VisualChat\nframework, designed to optimize Large Language Models (LLMs) by incorporating\nmulti-modal capabilities, with a focus on enhancing the proficiency of Large\nVision and Language Models in handling interleaved inputs. 
Our framework is\nnotable for (1) its open-source support for multi-round and multi-image\ndialogues, (2) introducing an innovative multi-modal causal attention\nmechanism, and (3) utilizing data blending techniques on existing datasets to\nassure seamless interactions in multi-round, multi-image conversations.\nCompared to existing frameworks, DeepSpeed-VisualChat shows superior\nscalability up to 70B parameter language model size, representing a significant\nadvancement in multi-modal language models and setting a solid foundation for\nfuture explorations.\n","authors":["Zhewei Yao","Xiaoxia Wu","Conglong Li","Minjia Zhang","Heyang Qin","Olatunji Ruwase","Ammar Ahmad Awan","Samyam Rajbhandari","Yuxiong He"],"pdf_url":"https://arxiv.org/pdf/2309.14327v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18297v3","updated":"2023-11-29T07:51:36Z","published":"2023-10-27T17:35:01Z","title":"Image Clustering Conditioned on Text Criteria","summary":" Classical clustering methods do not provide users with direct control of the\nclustering results, and the clustering results may not be consistent with the\nrelevant criterion that a user has in mind. In this work, we present a new\nmethodology for performing image clustering based on user-specified text\ncriteria by leveraging modern vision-language models and large language models.\nWe call our method Image Clustering Conditioned on Text Criteria (IC|TC), and\nit represents a different paradigm of image clustering. IC|TC requires a\nminimal and practical degree of human intervention and grants the user\nsignificant control over the clustering results in return. Our experiments show\nthat IC|TC can effectively cluster images with various criteria, such as human\naction, physical location, or the person's mood, while significantly\noutperforming baselines.\n","authors":["Sehyun Kwon","Jaeseung Park","Minkyu Kim","Jaewoong Cho","Ernest K. Ryu","Kangwook Lee"],"pdf_url":"https://arxiv.org/pdf/2310.18297v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17409v1","updated":"2023-11-29T07:29:26Z","published":"2023-11-29T07:29:26Z","title":"Talking Head(?) Anime from a Single Image 4: Improved Model and Its\n Distillation","summary":" We study the problem of creating a character model that can be controlled in\nreal time from a single image of an anime character. A solution to this problem\nwould greatly reduce the cost of creating avatars, computer games, and other\ninteractive applications.\n Talking Head Anime 3 (THA3) is an open source project that attempts to\ndirectly address the problem. It takes as input (1) an image of an anime\ncharacter's upper body and (2) a 45-dimensional pose vector and outputs a new\nimage of the same character taking the specified pose. The range of possible\nmovements is expressive enough for personal avatars and certain types of game\ncharacters. However, the system is too slow to generate animations in real time\non common PCs, and its image quality can be improved.\n In this paper, we improve THA3 in two ways. First, we propose new\narchitectures for constituent networks that rotate the character's head and\nbody based on U-Nets with attention that are widely used in modern generative\nmodels. The new architectures consistently yield better image quality than the\nTHA3 baseline. Nevertheless, they also make the whole system much slower: it\ntakes up to 150 milliseconds to generate a frame. 
Second, we propose a\ntechnique to distill the system into a small network (less than 2 MB) that can\ngenerate 512x512 animation frames in real time (under 30 FPS) using consumer\ngaming GPUs while keeping the image quality close to that of the full system.\nThis improvement makes the whole system practical for real-time applications.\n","authors":["Pramook Khungurn"],"pdf_url":"https://arxiv.org/pdf/2311.17409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17408v1","updated":"2023-11-29T07:25:49Z","published":"2023-11-29T07:25:49Z","title":"Dynamic Dense Graph Convolutional Network for Skeleton-based Human\n Motion Prediction","summary":" Graph Convolutional Networks (GCN) which typically follows a neural message\npassing framework to model dependencies among skeletal joints has achieved high\nsuccess in skeleton-based human motion prediction task. Nevertheless, how to\nconstruct a graph from a skeleton sequence and how to perform message passing\non the graph are still open problems, which severely affect the performance of\nGCN. To solve both problems, this paper presents a Dynamic Dense Graph\nConvolutional Network (DD-GCN), which constructs a dense graph and implements\nan integrated dynamic message passing. More specifically, we construct a dense\ngraph with 4D adjacency modeling as a comprehensive representation of motion\nsequence at different levels of abstraction. Based on the dense graph, we\npropose a dynamic message passing framework that learns dynamically from data\nto generate distinctive messages reflecting sample-specific relevance among\nnodes in the graph. Extensive experiments on benchmark Human 3.6M and CMU Mocap\ndatasets verify the effectiveness of our DD-GCN which obviously outperforms\nstate-of-the-art GCN-based methods, especially when using long-term and our\nproposed extremely long-term protocol.\n","authors":["Xinshun Wang","Wanying Zhang","Can Wang","Yuan Gao","Mengyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2311.17408v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17404v1","updated":"2023-11-29T07:15:34Z","published":"2023-11-29T07:15:34Z","title":"VITATECS: A Diagnostic Dataset for Temporal Concept Understanding of\n Video-Language Models","summary":" The ability to perceive how objects change over time is a crucial ingredient\nin human intelligence. However, current benchmarks cannot faithfully reflect\nthe temporal understanding abilities of video-language models (VidLMs) due to\nthe existence of static visual shortcuts. To remedy this issue, we present\nVITATECS, a diagnostic VIdeo-Text dAtaset for the evaluation of TEmporal\nConcept underStanding. Specifically, we first introduce a fine-grained taxonomy\nof temporal concepts in natural language in order to diagnose the capability of\nVidLMs to comprehend different temporal aspects. Furthermore, to disentangle\nthe correlation between static and temporal information, we generate\ncounterfactual video descriptions that differ from the original one only in the\nspecified temporal aspect. We employ a semi-automatic data collection framework\nusing large language models and human-in-the-loop annotation to obtain\nhigh-quality counterfactual descriptions efficiently. 
Evaluation of\nrepresentative video-language understanding models confirms their deficiency in\ntemporal understanding, revealing the need for greater emphasis on the temporal\nelements in video-language research.\n","authors":["Shicheng Li","Lei Li","Shuhuai Ren","Yuanxin Liu","Yi Liu","Rundong Gao","Xu Sun","Lu Hou"],"pdf_url":"https://arxiv.org/pdf/2311.17404v1.pdf","comment":"23 pages, 6 figures, 18 tables, data is available at\n https://github.com/lscpku/VITATECS"},{"id":"http://arxiv.org/abs/2310.06389v2","updated":"2023-11-29T07:09:08Z","published":"2023-10-10T07:52:30Z","title":"Learning Stackable and Skippable LEGO Bricks for Efficient,\n Reconfigurable, and Variable-Resolution Diffusion Modeling","summary":" Diffusion models excel at generating photo-realistic images but come with\nsignificant computational costs in both training and sampling. While various\ntechniques address these computational challenges, a less-explored issue is\ndesigning an efficient and adaptable network backbone for iterative refinement.\nCurrent options like U-Net and Vision Transformer often rely on\nresource-intensive deep networks and lack the flexibility needed for generating\nimages at variable resolutions or with a smaller network than used in training.\nThis study introduces LEGO bricks, which seamlessly integrate Local-feature\nEnrichment and Global-content Orchestration. These bricks can be stacked to\ncreate a test-time reconfigurable diffusion backbone, allowing selective\nskipping of bricks to reduce sampling costs and generate higher-resolution\nimages than the training data. LEGO bricks enrich local regions with an MLP and\ntransform them using a Transformer block while maintaining a consistent\nfull-resolution image across all bricks. Experimental results demonstrate that\nLEGO bricks enhance training efficiency, expedite convergence, and facilitate\nvariable-resolution image generation while maintaining strong generative\nperformance. Moreover, LEGO significantly reduces sampling time compared to\nother methods, establishing it as a valuable enhancement for diffusion models.\n","authors":["Huangjie Zheng","Zhendong Wang","Jianbo Yuan","Guanghan Ning","Pengcheng He","Quanzeng You","Hongxia Yang","Mingyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.06389v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17396v1","updated":"2023-11-29T06:53:23Z","published":"2023-11-29T06:53:23Z","title":"Spectral and Polarization Vision: Spectro-polarimetric Real-world\n Dataset","summary":" Image datasets are essential not only in validating existing methods in\ncomputer vision but also in developing new methods. Most existing image\ndatasets focus on trichromatic intensity images to mimic human vision. However,\npolarization and spectrum, the wave properties of light that animals in harsh\nenvironments and with limited brain capacity often rely on, remain\nunderrepresented in existing datasets. Although spectro-polarimetric datasets\nexist, these datasets have insufficient object diversity, limited illumination\nconditions, linear-only polarization data, and inadequate image count. Here, we\nintroduce two spectro-polarimetric datasets: trichromatic Stokes images and\nhyperspectral Stokes images. These novel datasets encompass both linear and\ncircular polarization; they introduce multiple spectral channels; and they\nfeature a broad selection of real-world scenes. 
With our dataset in hand, we\nanalyze the spectro-polarimetric image statistics, develop efficient\nrepresentations of such high-dimensional data, and evaluate spectral dependency\nof shape-from-polarization methods. As such, the proposed dataset promises a\nfoundation for data-driven spectro-polarimetric imaging and vision research.\nDataset and code will be publicly available.\n","authors":["Yujin Jeon","Eunsue Choi","Youngchan Kim","Yunseong Moon","Khalid Omer","Felix Heide","Seung-Hwan Baek"],"pdf_url":"https://arxiv.org/pdf/2311.17396v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05382v3","updated":"2023-11-29T06:49:12Z","published":"2023-06-08T17:31:24Z","title":"Image Blending Algorithm with Automatic Mask Generation","summary":" In recent years, image blending has gained popularity for its ability to\ncreate visually stunning content. However, the current image blending\nalgorithms mainly have the following problems: manually creating image blending\nmasks requires a lot of manpower and material resources; image blending\nalgorithms cannot effectively solve the problems of brightness distortion and\nlow resolution. To this end, we propose a new image blending method with\nautomatic mask generation: it combines semantic object detection and\nsegmentation with mask generation to achieve deep blended images based on our\nproposed new saturation loss and two-stage iteration of the PAN algorithm to\nfix brightness distortion and low-resolution issues. Results on publicly\navailable datasets show that our method outperforms other classical image\nblending algorithms on various performance metrics, including PSNR and SSIM.\n","authors":["Haochen Xue","Mingyu Jin","Chong Zhang","Yuxuan Huang","Qian Weng","Xiaobo Jin"],"pdf_url":"https://arxiv.org/pdf/2306.05382v3.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2311.17389v1","updated":"2023-11-29T06:42:12Z","published":"2023-11-29T06:42:12Z","title":"360Loc: A Dataset and Benchmark for Omnidirectional Visual Localization\n with Cross-device Queries","summary":" Portable 360$^\\circ$ cameras are becoming a cheap and efficient tool to\nestablish large visual databases. By capturing omnidirectional views of a\nscene, these cameras could expedite building environment models that are\nessential for visual localization. However, such an advantage is often\noverlooked due to the lack of valuable datasets. This paper introduces a new\nbenchmark dataset, 360Loc, composed of 360$^\\circ$ images with ground truth\nposes for visual localization. We present a practical implementation of\n360$^\\circ$ mapping combining 360$^\\circ$ images with lidar data to generate\nthe ground truth 6DoF poses. 360Loc is the first dataset and benchmark that\nexplores the challenge of cross-device visual positioning, involving\n360$^\\circ$ reference frames, and query frames from pinhole, ultra-wide FoV\nfisheye, and 360$^\\circ$ cameras. We propose a virtual camera approach to\ngenerate lower-FoV query frames from 360$^\\circ$ images, which ensures a fair\ncomparison of performance among different query types in visual localization\ntasks. We also extend this virtual camera approach to feature matching-based\nand pose regression-based methods to alleviate the performance loss caused by\nthe cross-device domain gap, and evaluate its effectiveness against\nstate-of-the-art baselines. We demonstrate that omnidirectional visual\nlocalization is more robust in challenging large-scale scenes with symmetries\nand repetitive structures. 
These results provide new insights into 360-camera\nmapping and omnidirectional visual localization with cross-device queries.\n","authors":["Huajian Huang","Changkun Liu","Yipeng Zhu","Hui Cheng","Tristan Braud","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2311.17389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01552v2","updated":"2023-11-29T06:40:01Z","published":"2023-04-04T06:06:59Z","title":"Meta-Learning with a Geometry-Adaptive Preconditioner","summary":" Model-agnostic meta-learning (MAML) is one of the most successful\nmeta-learning algorithms. It has a bi-level optimization structure where the\nouter-loop process learns a shared initialization and the inner-loop process\noptimizes task-specific weights. Although MAML relies on the standard gradient\ndescent in the inner-loop, recent studies have shown that controlling the\ninner-loop's gradient descent with a meta-learned preconditioner can be\nbeneficial. Existing preconditioners, however, cannot simultaneously adapt in a\ntask-specific and path-dependent way. Additionally, they do not satisfy the\nRiemannian metric condition, which can enable the steepest descent learning\nwith preconditioned gradient. In this study, we propose Geometry-Adaptive\nPreconditioned gradient descent (GAP) that can overcome the limitations in\nMAML; GAP can efficiently meta-learn a preconditioner that is dependent on\ntask-specific parameters, and its preconditioner can be shown to be a\nRiemannian metric. Thanks to the two properties, the geometry-adaptive\npreconditioner is effective for improving the inner-loop optimization.\nExperiment results show that GAP outperforms the state-of-the-art MAML family\nand preconditioned gradient descent-MAML (PGD-MAML) family in a variety of\nfew-shot learning tasks. Code is available at:\nhttps://github.com/Suhyun777/CVPR23-GAP.\n","authors":["Suhyun Kang","Duhun Hwang","Moonjung Eo","Taesup Kim","Wonjong Rhee"],"pdf_url":"https://arxiv.org/pdf/2304.01552v2.pdf","comment":"Accepted at CVPR 2023. Code is available at:\n https://github.com/Suhyun777/CVPR23-GAP; This is an extended version of our\n previous CVPR23 work"},{"id":"http://arxiv.org/abs/2304.00450v3","updated":"2023-11-29T06:05:47Z","published":"2023-04-02T05:05:58Z","title":"Sketch-based Video Object Localization","summary":" We introduce Sketch-based Video Object Localization (SVOL), a new task aimed\nat localizing spatio-temporal object boxes in video queried by the input\nsketch. We first outline the challenges in the SVOL task and build the\nSketch-Video Attention Network (SVANet) with the following design principles:\n(i) to consider temporal information of video and bridge the domain gap between\nsketch and video; (ii) to accurately identify and localize multiple objects\nsimultaneously; (iii) to handle various styles of sketches; (iv) to be\nclassification-free. In particular, SVANet is equipped with a Cross-modal\nTransformer that models the interaction between learnable object tokens, query\nsketch, and video through attention operations, and learns upon a per-frame set\nmatching strategy that enables frame-wise prediction while utilizing global\nvideo context. We evaluate SVANet on a newly curated SVOL dataset. By design,\nSVANet successfully learns the mapping between the query sketches and video\nobjects, achieving state-of-the-art results on the SVOL benchmark. We further\nconfirm the effectiveness of SVANet via extensive ablation studies and\nvisualizations. 
Lastly, we demonstrate its transfer capability on unseen\ndatasets and novel categories, suggesting its high scalability in real-world\napplications.\n","authors":["Sangmin Woo","So-Yeong Jeon","Jinyoung Park","Minji Son","Sumin Lee","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2304.00450v3.pdf","comment":"WACV 2024; Code: https://github.com/sangminwoo/SVOL"},{"id":"http://arxiv.org/abs/2311.16444v2","updated":"2023-11-29T06:01:34Z","published":"2023-11-28T02:51:13Z","title":"Exo2EgoDVC: Dense Video Captioning of Egocentric Procedural Activities\n Using Web Instructional Videos","summary":" We propose a novel benchmark for cross-view knowledge transfer of dense video\ncaptioning, adapting models from web instructional videos with exocentric views\nto an egocentric view. While dense video captioning (predicting time segments\nand their captions) is primarily studied with exocentric videos (e.g.,\nYouCook2), benchmarks with egocentric videos are restricted due to data\nscarcity. To overcome the limited video availability, transferring knowledge\nfrom abundant exocentric web videos is demanded as a practical approach.\nHowever, learning the correspondence between exocentric and egocentric views is\ndifficult due to their dynamic view changes. The web videos contain mixed views\nfocusing on either human body actions or close-up hand-object interactions,\nwhile the egocentric view is constantly shifting as the camera wearer moves.\nThis necessitates the in-depth study of cross-view transfer under complex view\nchanges. In this work, we first create a real-life egocentric dataset (EgoYC2)\nwhose captions are shared with YouCook2, enabling transfer learning between\nthese datasets assuming their ground-truth is accessible. To bridge the view\ngaps, we propose a view-invariant learning method using adversarial training in\nboth the pre-training and fine-tuning stages. While the pre-training is\ndesigned to learn invariant features against the mixed views in the web videos,\nthe view-invariant fine-tuning further mitigates the view gaps between both\ndatasets. We validate our proposed method by studying how effectively it\novercomes the view change problem and efficiently transfers the knowledge to\nthe egocentric domain. Our benchmark pushes the study of the cross-view\ntransfer into a new task domain of dense video captioning and will envision\nmethodologies to describe egocentric videos in natural language.\n","authors":["Takehiko Ohkawa","Takuma Yagi","Taichi Nishimura","Ryosuke Furuta","Atsushi Hashimoto","Yoshitaka Ushiku","Yoichi Sato"],"pdf_url":"https://arxiv.org/pdf/2311.16444v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10743v3","updated":"2023-11-29T05:58:16Z","published":"2023-08-21T14:16:36Z","title":"Enhancing Adversarial Attacks: The Similar Target Method","summary":" Deep neural networks are vulnerable to adversarial examples, posing a threat\nto the models' applications and raising security concerns. An intriguing\nproperty of adversarial examples is their strong transferability. Several\nmethods have been proposed to enhance transferability, including ensemble\nattacks which have demonstrated their efficacy. However, prior approaches\nsimply average logits, probabilities, or losses for model ensembling, lacking a\ncomprehensive analysis of how and why model ensembling significantly improves\ntransferability. In this paper, we propose a similar targeted attack method\nnamed Similar Target~(ST). 
By promoting cosine similarity between the gradients\nof each model, our method regularizes the optimization direction to\nsimultaneously attack all surrogate models. This strategy has been proven to\nenhance generalization ability. Experimental results on ImageNet validate the\neffectiveness of our approach in improving adversarial transferability. Our\nmethod outperforms state-of-the-art attackers on 18 discriminative classifiers\nand adversarially trained models.\n","authors":["Shuo Zhang","Ziruo Wang","Zikai Zhou","Huanran Chen"],"pdf_url":"https://arxiv.org/pdf/2308.10743v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09257v4","updated":"2023-11-29T05:55:16Z","published":"2023-11-14T23:07:50Z","title":"UFOGen: You Forward Once Large Scale Text-to-Image Generation via\n Diffusion GANs","summary":" Text-to-image diffusion models have demonstrated remarkable capabilities in\ntransforming textual prompts into coherent images, yet the computational cost\nof their inference remains a persistent challenge. To address this issue, we\npresent UFOGen, a novel generative model designed for ultra-fast, one-step\ntext-to-image synthesis. In contrast to conventional approaches that focus on\nimproving samplers or employing distillation techniques for diffusion models,\nUFOGen adopts a hybrid methodology, integrating diffusion models with a GAN\nobjective. Leveraging a newly introduced diffusion-GAN objective and\ninitialization with pre-trained diffusion models, UFOGen excels in efficiently\ngenerating high-quality images conditioned on textual descriptions in a single\nstep. Beyond traditional text-to-image generation, UFOGen showcases versatility\nin applications. Notably, UFOGen stands among the pioneering models enabling\none-step text-to-image generation and diverse downstream tasks, presenting a\nsignificant advancement in the landscape of efficient generative models.\n","authors":["Yanwu Xu","Yang Zhao","Zhisheng Xiao","Tingbo Hou"],"pdf_url":"https://arxiv.org/pdf/2311.09257v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17368v1","updated":"2023-11-29T05:42:25Z","published":"2023-11-29T05:42:25Z","title":"Two Scalable Approaches for Burned-Area Mapping Using U-Net and Landsat\n Imagery","summary":" Monitoring wildfires is an essential step in minimizing their impact on the\nplanet, understanding the many negative environmental, economic, and social\nconsequences. Recent advances in remote sensing technology combined with the\nincreasing application of artificial intelligence methods have improved\nreal-time, high-resolution fire monitoring. This study explores two proposed\napproaches based on the U-Net model for automating and optimizing the\nburned-area mapping process. Denoted 128 and AllSizes (AS), they are trained on\ndatasets with a different class balance by cropping input images to different\nsizes. They are then applied to Landsat imagery and time-series data from two\nfire-prone regions in Chile. The results obtained after enhancement of model\nperformance by hyperparameter optimization demonstrate the effectiveness of\nboth approaches. Tests based on 195 representative images of the study area\nshow that increasing dataset balance using the AS model yields better\nperformance. More specifically, AS exhibited a Dice Coefficient (DC) of 0.93,\nan Omission Error (OE) of 0.086, and a Commission Error (CE) of 0.045, while\nthe 128 model achieved a DC of 0.86, an OE of 0.12, and a CE of 0.12. 
These\nfindings should provide a basis for further development of scalable automatic\nburned-area mapping tools.\n","authors":["Ian Mancilla-Wulff","Jaime Carrasco","Cristobal Pais","Alejandro Miranda","Andres Weintraub"],"pdf_url":"https://arxiv.org/pdf/2311.17368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04760v2","updated":"2023-11-29T05:35:26Z","published":"2023-07-10T17:58:17Z","title":"Learning Spatial Features from Audio-Visual Correspondence in Egocentric\n Videos","summary":" We propose a self-supervised method for learning representations based on\nspatial audio-visual correspondences in egocentric videos. Our method uses a\nmasked auto-encoding framework to synthesize masked binaural (multi-channel)\naudio through the synergy of audio and vision, thereby learning useful spatial\nrelationships between the two modalities. We use our pretrained features to\ntackle two downstream video tasks requiring spatial understanding in social\nscenarios: active speaker detection and spatial audio denoising. Through\nextensive experiments, we show that our features are generic enough to improve\nover multiple state-of-the-art baselines on both tasks on two challenging\negocentric video datasets that offer binaural audio, EgoCom and EasyCom.\nProject: http://vision.cs.utexas.edu/projects/ego_av_corr.\n","authors":["Sagnik Majumder","Ziad Al-Halah","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2307.04760v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06842v5","updated":"2023-11-29T05:32:33Z","published":"2023-03-13T04:16:42Z","title":"Hierarchical Relationships: A New Perspective to Enhance Scene Graph\n Generation","summary":" This paper presents a finding that leveraging the hierarchical structures\namong labels for relationships and objects can substantially improve the\nperformance of scene graph generation systems. The focus of this work is to\ncreate an informative hierarchical structure that can divide object and\nrelationship categories into disjoint super-categories in a systematic way.\nSpecifically, we introduce a Bayesian prediction head to jointly predict the\nsuper-category of relationships between a pair of object instances, as well as\nthe detailed relationship within that super-category simultaneously,\nfacilitating more informative predictions. The resulting model exhibits the\ncapability to produce a more extensive set of predicates beyond the dataset\nannotations, and to tackle the prevalent issue of low annotation quality. While\nour paper presents preliminary findings, experiments on the Visual Genome\ndataset show its strong performance, particularly in predicate classifications\nand zero-shot settings, that demonstrates the promise of our approach.\n","authors":["Bowen Jiang","Camillo J. Taylor"],"pdf_url":"https://arxiv.org/pdf/2303.06842v5.pdf","comment":"NeurIPS 2023 New Frontiers in Graph Learning Workshop (NeurIPS\n GLFrontiers 2023); NeurIPS 2023 Queer in AI Workshop. This paper is a\n preliminary work of the full paper available at arXiv:2311.12889"},{"id":"http://arxiv.org/abs/2310.18348v3","updated":"2023-11-29T05:32:24Z","published":"2023-10-23T04:35:58Z","title":"Meaning Representations from Trajectories in Autoregressive Models","summary":" We propose to extract meaning representations from autoregressive language\nmodels by considering the distribution of all possible trajectories extending\nan input text. This strategy is prompt-free, does not require fine-tuning, and\nis applicable to any pre-trained autoregressive model. 
Moreover, unlike\nvector-based representations, distribution-based representations can also model\nasymmetric relations (e.g., direction of logical entailment, hypernym/hyponym\nrelations) by using algebraic operations between likelihood functions. These\nideas are grounded in distributional perspectives on semantics and are\nconnected to standard constructions in automata theory, but to our knowledge\nthey have not been applied to modern language models. We empirically show that\nthe representations obtained from large models align well with human\nannotations, outperform other zero-shot and prompt-free methods on semantic\nsimilarity tasks, and can be used to solve more complex entailment and\ncontainment tasks that standard embeddings cannot handle. Finally, we extend\nour method to represent data from different modalities (e.g., image and text)\nusing multimodal autoregressive models. Our code is available at:\nhttps://github.com/tianyu139/meaning-as-trajectories\n","authors":["Tian Yu Liu","Matthew Trager","Alessandro Achille","Pramuditha Perera","Luca Zancato","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2310.18348v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17366v1","updated":"2023-11-29T05:28:39Z","published":"2023-11-29T05:28:39Z","title":"Generative Hierarchical Temporal Transformer for Hand Action Recognition\n and Motion Prediction","summary":" We present a novel framework that concurrently tackles hand action\nrecognition and 3D future hand motion prediction. While previous works focus on\neither recognition or prediction, we propose a generative Transformer VAE\narchitecture to jointly capture both aspects, facilitating realistic motion\nprediction by leveraging the short-term hand motion and long-term action\nconsistency observed across timestamps.To ensure faithful representation of the\nsemantic dependency and different temporal granularity of hand pose and action,\nour framework is decomposed into two cascaded VAE blocks. The lower pose block\nmodels short-span poses, while the upper action block models long-span action.\nThese are connected by a mid-level feature that represents sub-second series of\nhand poses.Our framework is trained across multiple datasets, where pose and\naction blocks are trained separately to fully utilize pose-action annotations\nof different qualities. Evaluations show that on multiple datasets, the joint\nmodeling of recognition and prediction improves over separate solutions, and\nthe semantic and temporal hierarchy enables long-term pose and action modeling.\n","authors":["Yilin Wen","Hao Pan","Takehiko Ohkawa","Lei Yang","Jia Pan","Yoichi Sato","Taku Komura","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17365v1","updated":"2023-11-29T05:27:14Z","published":"2023-11-29T05:27:14Z","title":"Symbol-LLM: Leverage Language Models for Symbolic System in Visual Human\n Activity Reasoning","summary":" Human reasoning can be understood as a cooperation between the intuitive,\nassociative \"System-1\" and the deliberative, logical \"System-2\". For existing\nSystem-1-like methods in visual activity understanding, it is crucial to\nintegrate System-2 processing to improve explainability, generalization, and\ndata efficiency. One possible path of activity reasoning is building a symbolic\nsystem composed of symbols and rules, where one rule connects multiple symbols,\nimplying human knowledge and reasoning abilities. 
Previous methods have made\nprogress, but are defective with limited symbols from handcraft and limited\nrules from visual-based annotations, failing to cover the complex patterns of\nactivities and lacking compositional generalization. To overcome the defects,\nwe propose a new symbolic system with two ideal important properties:\nbroad-coverage symbols and rational rules. Collecting massive human knowledge\nvia manual annotations is expensive to instantiate this symbolic system.\nInstead, we leverage the recent advancement of LLMs (Large Language Models) as\nan approximation of the two ideal properties, i.e., Symbols from Large Language\nModels (Symbol-LLM). Then, given an image, visual contents from the images are\nextracted and checked as symbols and activity semantics are reasoned out based\non rules via fuzzy logic calculation. Our method shows superiority in extensive\nactivity understanding tasks. Code and data are available at\nhttps://mvig-rhos.com/symbol_llm.\n","authors":["Xiaoqian Wu","Yong-Lu Li","Jianhua Sun","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2311.17365v1.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.17361v1","updated":"2023-11-29T05:20:10Z","published":"2023-11-29T05:20:10Z","title":"How does spatial structure affect psychological restoration? A method\n based on Graph Neural Networks and Street View Imagery","summary":" The Attention Restoration Theory (ART) presents a theoretical framework with\nfour essential indicators (being away, extent, fascinating, and compatibility)\nfor comprehending urban and natural restoration quality. However, previous\nstudies relied on non-sequential data and non-spatial dependent methods, which\noverlooks the impact of spatial structure defined here as the positional\nrelationships between scene entities on restoration quality. The past methods\nalso make it challenging to measure restoration quality on an urban scale. In\nthis work, a spatial-dependent graph neural networks (GNNs) approach is\nproposed to reveal the relation between spatial structure and restoration\nquality on an urban scale. Specifically, we constructed two different types of\ngraphs at the street and city levels. The street-level graphs, using sequential\nstreet view images (SVIs) of road segments to capture position relationships\nbetween entities, were used to represent spatial structure. The city-level\ngraph, modeling the topological relationships of roads as non-Euclidean data\nstructures and embedding urban features (including Perception-features,\nSpatial-features, and Socioeconomic-features), was used to measure restoration\nquality. The results demonstrate that: 1) spatial-dependent GNNs model\noutperforms traditional methods (Acc = 0.735, F1 = 0.732); 2) spatial structure\nportrayed through sequential SVIs data significantly influences restoration\nquality; 3) spaces with the same restoration quality exhibited distinct spatial\nstructures patterns. 
This study clarifies the association between spatial\nstructure and restoration quality, providing a new perspective to improve urban\nwell-being in the future.\n","authors":["Haoran Ma","Yan Zhang","Pengyuan Liu","Fan Zhang","Pengyu Zhua"],"pdf_url":"https://arxiv.org/pdf/2311.17361v1.pdf","comment":"33 pages, 7 figures, Under review"},{"id":"http://arxiv.org/abs/2311.17354v1","updated":"2023-11-29T05:00:43Z","published":"2023-11-29T05:00:43Z","title":"A natural language processing-based approach: mapping human perception\n by understanding deep semantic features in street view images","summary":" In the past decade, using Street View images and machine learning to measure\nhuman perception has become a mainstream research approach in urban science.\nHowever, this approach using only image-shallow information makes it difficult\nto comprehensively understand the deep semantic features of human perception of\na scene. In this study, we proposed a new framework based on a pre-train\nnatural language model to understand the relationship between human perception\nand the sense of a scene. Firstly, Place Pulse 2.0 was used as our base\ndataset, which contains a variety of human-perceived labels, namely, beautiful,\nsafe, wealthy, depressing, boring, and lively. An image captioning network was\nused to extract the description information of each street view image.\nSecondly, a pre-trained BERT model was finetuning and added a regression\nfunction for six human perceptual dimensions. Furthermore, we compared the\nperformance of five traditional regression methods with our approach and\nconducted a migration experiment in Hong Kong. Our results show that human\nperception scoring by deep semantic features performed better than previous\nstudies by machine learning methods with shallow features. The use of deep\nscene semantic features provides new ideas for subsequent human perception\nresearch, as well as better explanatory power in the face of spatial\nheterogeneity.\n","authors":["Haoran Ma","Dongdong Wu"],"pdf_url":"https://arxiv.org/pdf/2311.17354v1.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.00688v2","updated":"2023-11-29T04:44:30Z","published":"2023-08-01T17:45:13Z","title":"AnyLoc: Towards Universal Visual Place Recognition","summary":" Visual Place Recognition (VPR) is vital for robot localization. To date, the\nmost performant VPR approaches are environment- and task-specific: while they\nexhibit strong performance in structured environments (predominantly urban\ndriving), their performance degrades severely in unstructured environments,\nrendering most approaches brittle to robust real-world deployment. In this\nwork, we develop a universal solution to VPR -- a technique that works across a\nbroad range of structured and unstructured environments (urban, outdoors,\nindoors, aerial, underwater, and subterranean environments) without any\nre-training or fine-tuning. We demonstrate that general-purpose feature\nrepresentations derived from off-the-shelf self-supervised models with no\nVPR-specific training are the right substrate upon which to build such a\nuniversal VPR solution. Combining these derived features with unsupervised\nfeature aggregation enables our suite of methods, AnyLoc, to achieve up to 4X\nsignificantly higher performance than existing approaches. We further obtain a\n6% improvement in performance by characterizing the semantic properties of\nthese features, uncovering unique domains which encapsulate datasets from\nsimilar environments. 
Our detailed experiments and analysis lay a foundation\nfor building VPR solutions that may be deployed anywhere, anytime, and across\nanyview. We encourage the readers to explore our project page and interactive\ndemos: https://anyloc.github.io/.\n","authors":["Nikhil Keetha","Avneesh Mishra","Jay Karhade","Krishna Murthy Jatavallabhula","Sebastian Scherer","Madhava Krishna","Sourav Garg"],"pdf_url":"https://arxiv.org/pdf/2308.00688v2.pdf","comment":"IEEE RA-L 2023 (Presented at ICRA 2024)"},{"id":"http://arxiv.org/abs/2311.17352v1","updated":"2023-11-29T04:31:35Z","published":"2023-11-29T04:31:35Z","title":"Efficient Stitchable Task Adaptation","summary":" The paradigm of pre-training and fine-tuning has laid the foundation for\ndeploying deep learning models. However, most fine-tuning methods are designed\nto meet a specific resource budget. Recently, considering diverse deployment\nscenarios with various resource budgets, stitchable neural network (SN-Net) is\nintroduced to quickly obtain numerous new networks (stitches) from the\npre-trained models (anchors) in a model family via model stitching. Although\npromising, SN-Net confronts new challenges when adapting it to new target\ndomains, including huge memory and storage requirements and a long and\nsub-optimal multistage adaptation process. In this work, we present a novel\nframework, Efficient Stitchable Task Adaptation (ESTA), to efficiently produce\na palette of fine-tuned models that adhere to diverse resource constraints.\nSpecifically, we first tailor parameter-efficient fine-tuning to share low-rank\nupdates among the stitches while maintaining independent bias terms. In this\nway, we largely reduce fine-tuning memory burdens and mitigate the interference\namong stitches that arises in task adaptation. Furthermore, we streamline a\nsimple yet effective one-stage deployment pipeline, which estimates the\nimportant stitches to deploy with training-time gradient statistics. By\nassigning higher sampling probabilities to important stitches, we also get a\nboosted Pareto frontier. Extensive experiments on 25 downstream visual\nrecognition tasks demonstrate that our ESTA is capable of generating stitches\nwith smooth accuracy-efficiency trade-offs and surpasses the direct SN-Net\nadaptation by remarkable margins with significantly lower training time and\nfewer trainable parameters. Furthermore, we demonstrate the flexibility and\nscalability of our ESTA framework by stitching LLMs from LLaMA family,\nobtaining chatbot stitches of assorted sizes.\n","authors":["Haoyu He","Zizheng Pan","Jing Liu","Jianfei Cai","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2311.17352v1.pdf","comment":"Source code will be released at\n https://github.com/ziplab/Stitched_LLaMA"},{"id":"http://arxiv.org/abs/2311.17350v1","updated":"2023-11-29T04:15:57Z","published":"2023-11-29T04:15:57Z","title":"Implicit-explicit Integrated Representations for Multi-view Video\n Compression","summary":" With the increasing consumption of 3D displays and virtual reality,\nmulti-view video has become a promising format. However, its high resolution\nand multi-camera shooting result in a substantial increase in data volume,\nmaking storage and transmission a challenging task. To tackle these\ndifficulties, we propose an implicit-explicit integrated representation for\nmulti-view video compression. 
Specifically, we first use the explicit\nrepresentation-based 2D video codec to encode one of the source views.\nSubsequently, we propose employing the implicit neural representation\n(INR)-based codec to encode the remaining views. The implicit codec takes the\ntime and view index of multi-view video as coordinate inputs and generates the\ncorresponding implicit reconstruction frames. To enhance the compressibility, we\nintroduce a multi-level feature grid embedding and a fully convolutional\narchitecture into the implicit codec. These components facilitate\ncoordinate-feature and feature-RGB mapping, respectively. To further enhance\nthe reconstruction quality from the INR codec, we leverage the high-quality\nreconstructed frames from the explicit codec to achieve inter-view\ncompensation. Finally, the compensated results are fused with the implicit\nreconstructions from the INR to obtain the final reconstructed frames. Our\nproposed framework combines the strengths of both implicit neural\nrepresentation and explicit 2D codec. Extensive experiments conducted on public\ndatasets demonstrate that the proposed framework can achieve comparable or even\nsuperior performance to the latest multi-view video compression standard MIV\nand other INR-based schemes in terms of view compression and scene modeling.\n","authors":["Chen Zhu","Guo Lu","Bing He","Rong Xie","Li Song"],"pdf_url":"https://arxiv.org/pdf/2311.17350v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10720v5","updated":"2023-11-29T04:00:00Z","published":"2023-06-19T06:41:19Z","title":"Exploring the Relationship between Samples and Masks for Robust Defect\n Localization","summary":" Defect detection aims to detect and localize regions out of the normal\ndistribution. Previous approaches model normality and compare it with the input\nto identify defective regions, potentially limiting their generalizability. This\npaper proposes a one-stage framework that detects defective patterns directly\nwithout the modeling process. This ability is achieved through the joint efforts\nof three parties: a generative adversarial network (GAN), a newly proposed\nscaled pattern loss, and a dynamic masked cycle-consistent auxiliary network.\nExplicit information that could indicate the position of defects is\nintentionally excluded to avoid learning any direct mapping. Experimental\nresults on the texture class of the challenging MVTec AD dataset show that the\nproposed method is 2.9% higher than the SOTA methods in F1-Score, while\nsubstantially outperforming SOTA methods in generalizability.\n","authors":["Jiang Lin","Yaping Yan"],"pdf_url":"https://arxiv.org/pdf/2306.10720v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2102.07737v4","updated":"2023-11-29T03:43:13Z","published":"2021-02-15T18:34:38Z","title":"Zero-Shot Self-Supervised Learning for MRI Reconstruction","summary":" Deep learning (DL) has emerged as a powerful tool for accelerated MRI\nreconstruction, but often necessitates a database of fully-sampled measurements\nfor training. Recent self-supervised and unsupervised learning approaches\nenable training without fully-sampled data. However, a database of undersampled\nmeasurements may not be available in many scenarios, especially for scans\ninvolving contrast or translational acquisitions in development. Moreover,\nrecent studies show that database-trained models may not generalize well when\nthe unseen measurements differ in terms of sampling pattern, acceleration rate,\nSNR, image contrast, and anatomy. 
Such challenges necessitate a new methodology\nto enable subject-specific DL MRI reconstruction without external training\ndatasets, since it is clinically imperative to provide high-quality\nreconstructions that can be used to identify lesions/disease for \\emph{every\nindividual}. In this work, we propose a zero-shot self-supervised learning\napproach to perform subject-specific accelerated DL MRI reconstruction to\ntackle these issues. The proposed approach partitions the available\nmeasurements from a single scan into three disjoint sets. Two of these sets are\nused to enforce data consistency and define loss during training for\nself-supervision, while the last set serves to self-validate, establishing an\nearly stopping criterion. In the presence of models pre-trained on a database\nwith different image characteristics, we show that the proposed approach can be\ncombined with transfer learning for faster convergence time and reduced\ncomputational complexity. The code is available at\n\\url{https://github.com/byaman14/ZS-SSL}.\n","authors":["Burhaneddin Yaman","Seyed Amir Hossein Hosseini","Mehmet Akçakaya"],"pdf_url":"https://arxiv.org/pdf/2102.07737v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17340v1","updated":"2023-11-29T03:38:56Z","published":"2023-11-29T03:38:56Z","title":"Cross-Scope Spatial-Spectral Information Aggregation for Hyperspectral\n Image Super-Resolution","summary":" Hyperspectral image super-resolution has attained widespread prominence to\nenhance the spatial resolution of hyperspectral images. However,\nconvolution-based methods have encountered challenges in harnessing the global\nspatial-spectral information. The prevailing transformer-based methods have not\nadequately captured the long-range dependencies in both spectral and spatial\ndimensions. To alleviate this issue, we propose a novel cross-scope\nspatial-spectral Transformer (CST) to efficiently investigate long-range\nspatial and spectral similarities for single hyperspectral image\nsuper-resolution. Specifically, we devise cross-attention mechanisms in spatial\nand spectral dimensions to comprehensively model the long-range\nspatial-spectral characteristics. By integrating global information into the\nrectangle-window self-attention, we first design a cross-scope spatial\nself-attention to facilitate long-range spatial interactions. Then, by\nleveraging appropriately characteristic spatial-spectral features, we construct\na cross-scope spectral self-attention to effectively capture the intrinsic\ncorrelations among global spectral bands. Finally, we elaborate a concise\nfeed-forward neural network to enhance the feature representation capacity in\nthe Transformer structure. Extensive experiments over three hyperspectral\ndatasets demonstrate that the proposed CST is superior to other\nstate-of-the-art methods both quantitatively and visually. The code is\navailable at \\url{https://github.com/Tomchenshi/CST.git}.\n","authors":["Shi Chen","Lefei Zhang","Liangpei Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.17340v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17339v1","updated":"2023-11-29T03:37:14Z","published":"2023-11-29T03:37:14Z","title":"RADAP: A Robust and Adaptive Defense Against Diverse Adversarial Patches\n on Face Recognition","summary":" Face recognition (FR) systems powered by deep learning have become widely\nused in various applications. 
However, they are vulnerable to adversarial\nattacks, especially those based on local adversarial patches that can be\nphysically applied to real-world objects. In this paper, we propose RADAP, a\nrobust and adaptive defense mechanism against diverse adversarial patches in\nboth closed-set and open-set FR systems. RADAP employs innovative techniques,\nsuch as FCutout and F-patch, which use Fourier space sampling masks to improve\nthe occlusion robustness of the FR model and the performance of the patch\nsegmenter. Moreover, we introduce an edge-aware binary cross-entropy (EBCE)\nloss function to enhance the accuracy of patch detection. We also present the\nsplit and fill (SAF) strategy, which is designed to counter the vulnerability\nof the patch segmenter to complete white-box adaptive attacks. We conduct\ncomprehensive experiments to validate the effectiveness of RADAP, which shows\nsignificant improvements in defense performance against various adversarial\npatches, while maintaining clean accuracy higher than that of the undefended\nVanilla model.\n","authors":["Xiaoliang Liu","Furao Shen","Jian Zhao","Changhai Nie"],"pdf_url":"https://arxiv.org/pdf/2311.17339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17338v1","updated":"2023-11-29T03:36:07Z","published":"2023-11-29T03:36:07Z","title":"VideoAssembler: Identity-Consistent Video Generation with Reference\n Entities using Diffusion Model","summary":" Identity-consistent video generation seeks to synthesize videos that are\nguided by both textual prompts and reference images of entities. Current\napproaches typically utilize cross-attention layers to integrate the appearance\nof the entity, which predominantly captures semantic attributes, resulting in\ncompromised fidelity of entities. Moreover, these methods necessitate iterative\nfine-tuning for each new entity encountered, thereby limiting their\napplicability. To address these challenges, we introduce VideoAssembler, a\nnovel end-to-end framework for identity-consistent video generation that can\nconduct inference directly when encountering new entities. VideoAssembler is\nadept at producing videos that are not only flexible with respect to the input\nreference entities but also responsive to textual conditions. Additionally, by\nmodulating the quantity of input images for the entity, VideoAssembler enables\nthe execution of tasks ranging from image-to-video generation to sophisticated\nvideo editing. VideoAssembler comprises two principal components: the Reference\nEntity Pyramid (REP) encoder and the Entity-Prompt Attention Fusion (EPAF)\nmodule. The REP encoder is designed to infuse comprehensive appearance details\ninto the denoising stages of the stable diffusion model. Concurrently, the EPAF\nmodule is utilized to integrate text-aligned features effectively. Furthermore,\nto mitigate the challenge of scarce data, we present a methodology for the\npreprocessing of training data. Our evaluation of the VideoAssembler framework\non the UCF-101, MSR-VTT, and DAVIS datasets indicates that it achieves good\nperformances in both quantitative and qualitative analyses (346.84 in FVD and\n48.01 in IS on UCF-101). 
Our project page is at\nhttps://videoassembler.github.io/videoassembler.\n","authors":["Haoyu Zhao","Tianyi Lu","Jiaxi Gu","Xing Zhang","Zuxuan Wu","Hang Xu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.17338v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15068v3","updated":"2023-11-29T03:33:51Z","published":"2023-08-29T07:00:35Z","title":"A Comprehensive Augmentation Framework for Anomaly Detection","summary":" Data augmentation methods are commonly integrated into the training of\nanomaly detection models. Previous approaches have primarily focused on\nreplicating real-world anomalies or enhancing diversity, without considering\nthat the standard of anomaly varies across different classes, potentially\nleading to a biased training distribution.This paper analyzes crucial traits of\nsimulated anomalies that contribute to the training of reconstructive networks\nand condenses them into several methods, thus creating a comprehensive\nframework by selectively utilizing appropriate combinations.Furthermore, we\nintegrate this framework with a reconstruction-based approach and concurrently\npropose a split training strategy that alleviates the issue of overfitting\nwhile avoiding introducing interference to the reconstruction process. The\nevaluations conducted on the MVTec anomaly detection dataset demonstrate that\nour method outperforms the previous state-of-the-art approach, particularly in\nterms of object classes. To evaluate generalizability, we generate a simulated\ndataset comprising anomalies with diverse characteristics since the original\ntest samples only include specific types of anomalies and may lead to biased\nevaluations. Experimental results demonstrate that our approach exhibits\npromising potential for generalizing effectively to various unforeseen\nanomalies encountered in real-world scenarios.\n","authors":["Jiang Lin","Yaping Yan"],"pdf_url":"https://arxiv.org/pdf/2308.15068v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16337v2","updated":"2023-11-29T03:24:31Z","published":"2023-11-27T21:53:17Z","title":"Multi-3D-Models Registration-Based Augmented Reality (AR) Instructions\n for Assembly","summary":" This paper introduces a novel, markerless, step-by-step, in-situ 3D Augmented\nReality (AR) instruction method and its application - BRICKxAR (Multi 3D\nModels/M3D) - for small parts assembly. BRICKxAR (M3D) realistically visualizes\nrendered 3D assembly parts at the assembly location of the physical assembly\nmodel (Figure 1). The user controls the assembly process through a user\ninterface. BRICKxAR (M3D) utilizes deep learning-trained 3D model-based\nregistration. Object recognition and tracking become challenging as the\nassembly model updates at each step. Additionally, not every part in a 3D\nassembly may be visible to the camera during the assembly. BRICKxAR (M3D)\ncombines multiple assembly phases with a step count to address these\nchallenges. Thus, using fewer phases simplifies the complex assembly process\nwhile step count facilitates accurate object recognition and precise\nvisualization of each step. A testing and heuristic evaluation of the BRICKxAR\n(M3D) prototype and qualitative analysis were conducted with users and experts\nin visualization and human-computer interaction. 
Providing robust 3D AR\ninstructions and allowing the handling of the assembly model, BRICKxAR (M3D)\nhas the potential to be used at different scales ranging from manufacturing\nassembly to construction.\n","authors":["Seda Tuzun Canadinc","Wei Yan"],"pdf_url":"https://arxiv.org/pdf/2311.16337v2.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2311.17780v1","updated":"2023-11-29T16:26:00Z","published":"2023-11-29T16:26:00Z","title":"$Q_{bias}$ -- A Dataset on Media Bias in Search Queries and Query\n Suggestions","summary":" This publication describes the motivation and generation of $Q_{bias}$, a\nlarge dataset of Google and Bing search queries, a scraping tool and dataset\nfor biased news articles, as well as language models for the investigation of\nbias in online search. Web search engines are a major factor and trusted source\nin information search, especially in the political domain. However, biased\ninformation can influence opinion formation and lead to biased opinions. To\ninteract with search engines, users formulate search queries and interact with\nsearch query suggestions provided by the search engines. A lack of datasets on\nsearch queries inhibits research on the subject. We use $Q_{bias}$ to evaluate\ndifferent approaches to fine-tuning transformer-based language models with the\ngoal of producing models capable of biasing text with left and right political\nstance. Additionally to this work we provided datasets and language models for\nbiasing texts that allow further research on bias in online information search.\n","authors":["Fabian Haak","Philipp Schaer"],"pdf_url":"https://arxiv.org/pdf/2311.17780v1.pdf","comment":"Paper accepted at ACM Web Science Conference 2023. 6 pages"},{"id":"http://arxiv.org/abs/2311.13534v3","updated":"2023-11-29T16:18:38Z","published":"2023-11-22T17:14:54Z","title":"LM-Cocktail: Resilient Tuning of Language Models via Model Merging","summary":" The pre-trained language models are continually fine-tuned to better support\ndownstream applications. However, this operation may result in significant\nperformance degeneration on general tasks beyond the targeted domain. To\novercome this problem, we propose LM-Cocktail which enables the fine-tuned\nmodel to stay resilient in general perspectives. Our method is conducted in the\nform of model merging, where the fine-tuned language model is merged with the\npre-trained base model or the peer models from other domains through weighted\naverage. Despite simplicity, LM-Cocktail is surprisingly effective: the\nresulted model is able to achieve a strong empirical performance in the whole\nscope of general tasks while preserving a superior capacity in its targeted\ndomain. We conduct comprehensive experiments with LLama and BGE model on\npopular benchmarks, including FLAN, MMLU, MTEB, whose results validate the\nefficacy of our proposed method. 
The code and checkpoints are available at\nhttps://github.com/FlagOpen/FlagEmbedding/tree/master/LM_Cocktail.\n","authors":["Shitao Xiao","Zheng Liu","Peitian Zhang","Xingrun Xing"],"pdf_url":"https://arxiv.org/pdf/2311.13534v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00402v4","updated":"2023-11-29T15:00:14Z","published":"2023-09-30T14:55:44Z","title":"DiskANN++: Efficient Page-based Search over Isomorphic Mapped Graph\n Index using Query-sensitivity Entry Vertex","summary":" Given a vector dataset $\\mathcal{X}$ and a query vector $\\vec{x}_q$,\ngraph-based Approximate Nearest Neighbor Search (ANNS) aims to build a graph\nindex $G$ and approximately return vectors with minimum distances to\n$\\vec{x}_q$ by searching over $G$. The main drawback of graph-based ANNS is\nthat a graph index would be too large to fit into the memory especially for a\nlarge-scale $\\mathcal{X}$. To solve this, a Product Quantization (PQ)-based\nhybrid method called DiskANN is proposed to store a low-dimensional PQ index in\nmemory and retain a graph index in SSD, thus reducing memory overhead while\nensuring a high search accuracy. However, it suffers from two I/O issues that\nsignificantly affect the overall efficiency: (1) long routing path from an\nentry vertex to the query's neighborhood that results in large number of I/O\nrequests and (2) redundant I/O requests during the routing process. We propose\nan optimized DiskANN++ to overcome above issues. Specifically, for the first\nissue, we present a query-sensitive entry vertex selection strategy to replace\nDiskANN's static graph-central entry vertex by a dynamically determined entry\nvertex that is close to the query. For the second I/O issue, we present an\nisomorphic mapping on DiskANN's graph index to optimize the SSD layout and\npropose an asynchronously optimized Pagesearch based on the optimized SSD\nlayout as an alternative to DiskANN's beamsearch. Comprehensive experimental\nstudies on eight real-world datasets demonstrate our DiskANN++'s superiority on\nefficiency. We achieve a notable 1.5 X to 2.2 X improvement on QPS compared to\nDiskANN, given the same accuracy constraint.\n","authors":["Jiongkang Ni","Xiaoliang Xu","Yuxiang Wang","Can Li","Jiajie Yao","Shihai Xiao","Xuecang Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.00402v4.pdf","comment":"15 pages including references"},{"id":"http://arxiv.org/abs/2310.03605v3","updated":"2023-11-29T14:30:29Z","published":"2023-10-05T15:36:35Z","title":"FASER: Binary Code Similarity Search through the use of Intermediate\n Representations","summary":" Being able to identify functions of interest in cross-architecture software\nis useful whether you are analysing for malware, securing the software supply\nchain or conducting vulnerability research. Cross-Architecture Binary Code\nSimilarity Search has been explored in numerous studies and has used a wide\nrange of different data sources to achieve its goals. The data sources\ntypically used draw on common structures derived from binaries such as function\ncontrol flow graphs or binary level call graphs, the output of the disassembly\nprocess or the outputs of a dynamic analysis approach. One data source which\nhas received less attention is binary intermediate representations. Binary\nIntermediate representations possess two interesting properties: they are cross\narchitecture by their very nature and encode the semantics of a function\nexplicitly to support downstream usage. 
Within this paper we propose Function\nas a String Encoded Representation (FASER) which combines long document\ntransformers with the use of intermediate representations to create a model\ncapable of cross architecture function search without the need for manual\nfeature engineering, pre-training or a dynamic analysis step. We compare our\napproach against a series of baseline approaches for two tasks; A general\nfunction search task and a targeted vulnerability search task. Our approach\ndemonstrates strong performance across both tasks, performing better than all\nbaseline approaches.\n","authors":["Josh Collyer","Tim Watson","Iain Phillips"],"pdf_url":"https://arxiv.org/pdf/2310.03605v3.pdf","comment":"10 pages, Proceedings of the Conference on Applied Machine Learning\n in Information Security (CAMLIS)"},{"id":"http://arxiv.org/abs/2311.17650v1","updated":"2023-11-29T14:12:00Z","published":"2023-11-29T14:12:00Z","title":"Creator Context for Tweet Recommendation","summary":" When discussing a tweet, people usually not only refer to the content it\ndelivers, but also to the person behind the tweet. In other words, grounding\nthe interpretation of the tweet in the context of its creator plays an\nimportant role in deciphering the true intent and the importance of the tweet.\n In this paper, we attempt to answer the question of how creator context\nshould be used to advance tweet understanding. Specifically, we investigate the\nusefulness of different types of creator context, and examine different model\nstructures for incorporating creator context in tweet modeling. We evaluate our\ntweet understanding models on a practical use case -- recommending relevant\ntweets to news articles. This use case already exists in popular news apps, and\ncan also serve as a useful assistive tool for journalists. We discover that\ncreator context is essential for tweet understanding, and can improve\napplication metrics by a large margin. However, we also observe that not all\ncreator contexts are equal. Creator context can be time sensitive and noisy.\nCareful creator context selection and deliberate model structure design play an\nimportant role in creator context effectiveness.\n","authors":["Spurthi Amba Hombaiah","Tao Chen","Mingyang Zhang","Michael Bendersky","Marc Najork","Matt Colen","Sergey Levi","Vladimir Ofitserov","Tanvir Amin"],"pdf_url":"https://arxiv.org/pdf/2311.17650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17374v1","updated":"2023-11-29T05:59:24Z","published":"2023-11-29T05:59:24Z","title":"Attribute Simulation for Item Embedding Enhancement in Multi-interest\n Recommendation","summary":" Although multi-interest recommenders have achieved significant progress in\nthe matching stage, our research reveals that existing models tend to exhibit\nan under-clustered item embedding space, which leads to a low discernibility\nbetween items and hampers item retrieval. This highlights the necessity for\nitem embedding enhancement. However, item attributes, which serve as effective\nand straightforward side information for enhancement, are either unavailable or\nincomplete in many public datasets due to the labor-intensive nature of manual\nannotation tasks. This dilemma raises two meaningful questions: 1. Can we\nbypass manual annotation and directly simulate complete attribute information\nfrom the interaction data? And 2. 
If feasible, how to simulate attributes with\nhigh accuracy and low complexity in the matching stage?\n In this paper, we first establish an inspiring theoretical feasibility that\nthe item-attribute correlation matrix can be approximated through elementary\ntransformations on the item co-occurrence matrix. Then based on formula\nderivation, we propose a simple yet effective module, SimEmb (Item Embedding\nEnhancement via Simulated Attribute), in the multi-interest recommendation of\nthe matching stage to implement our findings. By simulating attributes with the\nco-occurrence matrix, SimEmb discards the item ID-based embedding and employs\nthe attribute-weighted summation for item embedding enhancement. Comprehensive\nexperiments on four benchmark datasets demonstrate that our approach notably\nenhances the clustering of item embedding and significantly outperforms SOTA\nmodels with an average improvement of 25.59% on Recall@20.\n","authors":["Yaokun Liu","Xiaowang Zhang","Minghui Zou","Zhiyong Feng"],"pdf_url":"https://arxiv.org/pdf/2311.17374v1.pdf","comment":"This paper has been accepted by the 17th ACM International Conference\n on Web Search and Data Mining (WSDM 2024). The camera-ready version will be\n available in the conference proceedings"},{"id":"http://arxiv.org/abs/2311.00423v5","updated":"2023-11-29T05:52:56Z","published":"2023-11-01T10:27:44Z","title":"LLMRec: Large Language Models with Graph Augmentation for Recommendation","summary":" The problem of data sparsity has long been a challenge in recommendation\nsystems, and previous studies have attempted to address this issue by\nincorporating side information. However, this approach often introduces side\neffects such as noise, availability issues, and low data quality, which in turn\nhinder the accurate modeling of user preferences and adversely impact\nrecommendation performance. In light of the recent advancements in large\nlanguage models (LLMs), which possess extensive knowledge bases and strong\nreasoning capabilities, we propose a novel framework called LLMRec that\nenhances recommender systems by employing three simple yet effective LLM-based\ngraph augmentation strategies. Our approach leverages the rich content\navailable within online platforms (e.g., Netflix, MovieLens) to augment the\ninteraction graph in three ways: (i) reinforcing user-item interaction edges,\n(ii) enhancing the understanding of item node attributes, and (iii) conducting\nuser node profiling, intuitively from the natural language perspective. By\nemploying these strategies, we address the challenges posed by sparse implicit\nfeedback and low-quality side information in recommenders. Besides, to ensure\nthe quality of the augmentation, we develop a denoised data robustification\nmechanism that includes techniques of noisy implicit feedback pruning and\nMAE-based feature enhancement that help refine the augmented data and improve\nits reliability. Furthermore, we provide theoretical analysis to support the\neffectiveness of LLMRec and clarify the benefits of our method in facilitating\nmodel optimization. Experimental results on benchmark datasets demonstrate the\nsuperiority of our LLM-based augmentation approach over state-of-the-art\ntechniques. 
To ensure reproducibility, we have made our code and augmented data\npublicly available at: https://github.com/HKUDS/LLMRec.git\n","authors":["Wei Wei","Xubin Ren","Jiabin Tang","Qinyong Wang","Lixin Su","Suqi Cheng","Junfeng Wang","Dawei Yin","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2311.00423v5.pdf","comment":"WSDM 2024 Oral Presentation"},{"id":"http://arxiv.org/abs/2305.11164v3","updated":"2023-11-29T23:07:15Z","published":"2023-05-18T17:52:58Z","title":"Exploring the Carbon Footprint of Hugging Face's ML Models: A Repository\n Mining Study","summary":" The rise of machine learning (ML) systems has exacerbated their carbon\nfootprint due to increased capabilities and model sizes. However, there is\nscarce knowledge on how the carbon footprint of ML models is actually measured,\nreported, and evaluated. In light of this, the paper aims to analyze the\nmeasurement of the carbon footprint of 1,417 ML models and associated datasets\non Hugging Face, which is the most popular repository for pretrained ML models.\nThe goal is to provide insights and recommendations on how to report and\noptimize the carbon efficiency of ML models. The study includes the first\nrepository mining study on the Hugging Face Hub API on carbon emissions. This\nstudy seeks to answer two research questions: (1) how do ML model creators\nmeasure and report carbon emissions on Hugging Face Hub?, and (2) what aspects\nimpact the carbon emissions of training ML models? The study yielded several\nkey findings. These include a stalled proportion of carbon emissions-reporting\nmodels, a slight decrease in reported carbon footprint on Hugging Face over the\npast 2 years, and a continued dominance of NLP as the main application domain.\nFurthermore, the study uncovers correlations between carbon emissions and\nvarious attributes such as model size, dataset size, and ML application\ndomains. These results highlight the need for software measurements to improve\nenergy reporting practices and promote carbon-efficient model development\nwithin the Hugging Face community. In response to this issue, two\nclassifications are proposed: one for categorizing models based on their carbon\nemission reporting practices and another for their carbon efficiency. The aim\nof these classification proposals is to foster transparency and sustainable\nmodel development within the ML community.\n","authors":["Joel Castaño","Silverio Martínez-Fernández","Xavier Franch","Justus Bogner"],"pdf_url":"https://arxiv.org/pdf/2305.11164v3.pdf","comment":"Accepted at the 2023 ACM/IEEE International Symposium on Empirical\n Software Engineering and Measurement (ESEM)"},{"id":"http://arxiv.org/abs/2311.18118v1","updated":"2023-11-29T22:13:53Z","published":"2023-11-29T22:13:53Z","title":"AnonPSI: An Anonymity Assessment Framework for PSI","summary":" Private Set Intersection (PSI) is a widely used protocol that enables two\nparties to securely compute a function over the intersected part of their\nshared datasets and has been a significant research focus over the years.\nHowever, recent studies have highlighted its vulnerability to Set Membership\nInference Attacks (SMIA), where an adversary might deduce an individual's\nmembership by invoking multiple PSI protocols. This presents a considerable\nrisk, even in the most stringent versions of PSI, which only return the\ncardinality of the intersection. This paper explores the evaluation of\nanonymity within the PSI context. 
Initially, we highlight the reasons why\nexisting works fall short in measuring privacy leakage, and subsequently\npropose two attack strategies that address these deficiencies. Furthermore, we\nprovide theoretical guarantees on the performance of our proposed methods. In\naddition to these, we illustrate how the integration of auxiliary information,\nsuch as the sum of payloads associated with members of the intersection\n(PSI-SUM), can enhance attack efficiency. We conducted a comprehensive\nperformance evaluation of the proposed attack strategies utilizing two real\ndatasets. Our findings indicate that the methods we propose markedly enhance\nattack efficiency when contrasted with previous research endeavors. The\neffectiveness of these attacks implies that relying solely on existing PSI protocols may\nnot provide an adequate level of privacy assurance. It is recommended to\ncombine privacy-enhancing technologies synergistically to enhance privacy\nprotection even further.\n","authors":["Bo Jiang","Jian Du","Qiang Yan"],"pdf_url":"https://arxiv.org/pdf/2311.18118v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2305.19891v3","updated":"2023-11-29T18:58:05Z","published":"2023-05-31T14:26:14Z","title":"Dynamic Neighborhood Construction for Structured Large Discrete Action\n Spaces","summary":" Large discrete action spaces (LDAS) remain a central challenge in\nreinforcement learning. Existing solution approaches can handle unstructured\nLDAS with up to a few million actions. However, many real-world applications in\nlogistics, production, and transportation systems have combinatorial action\nspaces, whose size grows well beyond millions of actions, even on small\ninstances. Fortunately, such action spaces exhibit structure, e.g., equally\nspaced discrete resource units. With this work, we focus on handling structured\nLDAS (SLDAS) with sizes that cannot be handled by current benchmarks: we\npropose Dynamic Neighborhood Construction (DNC), a novel exploitation paradigm\nfor SLDAS. We present a scalable neighborhood exploration heuristic that\nutilizes this paradigm and efficiently explores the discrete neighborhood\naround the continuous proxy action in structured action spaces with up to\n$10^{73}$ actions. We demonstrate the performance of our method by benchmarking\nit against three state-of-the-art approaches designed for large discrete action\nspaces across two distinct environments. Our results show that DNC matches or\noutperforms state-of-the-art approaches while being computationally more\nefficient. Furthermore, our method scales to action spaces that so far remained\ncomputationally intractable for existing methodologies.\n","authors":["Fabian Akkerman","Julius Luy","Wouter van Heeswijk","Maximilian Schiffer"],"pdf_url":"https://arxiv.org/pdf/2305.19891v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.07626v2","updated":"2023-11-29T18:57:23Z","published":"2022-08-16T09:24:47Z","title":"Algorithmic Assistance with Recommendation-Dependent Preferences","summary":" When we use algorithms to produce risk assessments, we typically think of\nthese predictions as providing helpful input to human decisions, such as when\nrisk scores are presented to judges or doctors. But when a decision-maker\nobtains algorithmic assistance, they may not only react to the information. 
The\ndecision-maker may view the input of the algorithm as recommending a default\naction, making it costly for them to deviate, such as when a judge is reluctant\nto overrule a high-risk assessment of a defendant or a doctor fears the\nconsequences of deviating from recommended procedures. In this article, we\npropose a principal-agent model of joint human-machine decision-making. Within\nthis model, we consider the effect and design of algorithmic recommendations\nwhen they affect choices not just by shifting beliefs, but also by altering\npreferences. We motivate this assumption from institutional factors, such as a\ndesire to avoid audits, as well as from well-established models in behavioral\nscience that predict loss aversion relative to a reference point, which here is\nset by the algorithm. We show that recommendation-dependent preferences create\ninefficiencies where the decision-maker is overly responsive to the\nrecommendation. As a potential remedy, we discuss algorithms that strategically\nwithhold recommendations, and show how they can improve the quality of final\ndecisions.\n","authors":["Bryce McLaughlin","Jann Spiess"],"pdf_url":"https://arxiv.org/pdf/2208.07626v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17901v1","updated":"2023-11-29T18:53:34Z","published":"2023-11-29T18:53:34Z","title":"SODA: Bottleneck Diffusion Models for Representation Learning","summary":" We introduce SODA, a self-supervised diffusion model, designed for\nrepresentation learning. The model incorporates an image encoder, which\ndistills a source view into a compact representation, that, in turn, guides the\ngeneration of related novel views. We show that by imposing a tight bottleneck\nbetween the encoder and a denoising decoder, and leveraging novel view\nsynthesis as a self-supervised objective, we can turn diffusion models into\nstrong representation learners, capable of capturing visual semantics in an\nunsupervised manner. To the best of our knowledge, SODA is the first diffusion\nmodel to succeed at ImageNet linear-probe classification, and, at the same\ntime, it accomplishes reconstruction, editing and synthesis tasks across a wide\nrange of datasets. Further investigation reveals the disentangled nature of its\nemergent latent space, that serves as an effective interface to control and\nmanipulate the model's produced images. All in all, we aim to shed light on the\nexciting and promising potential of diffusion models, not only for image\ngeneration, but also for learning rich and robust representations.\n","authors":["Drew A. Hudson","Daniel Zoran","Mateusz Malinowski","Andrew K. Lampinen","Andrew Jaegle","James L. McClelland","Loic Matthey","Felix Hill","Alexander Lerchner"],"pdf_url":"https://arxiv.org/pdf/2311.17901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17898v1","updated":"2023-11-29T18:51:46Z","published":"2023-11-29T18:51:46Z","title":"Knowledge Pursuit Prompting for Zero-Shot Multimodal Synthesis","summary":" Hallucinations and unfaithful synthesis due to inaccurate prompts with\ninsufficient semantic details are widely observed in multimodal generative\nmodels. A prevalent strategy to align multiple modalities is to fine-tune the\ngenerator with a large number of annotated text-image pairs. However, such a\nprocedure is labor-consuming and resource-draining. The key question we ask is:\ncan we enhance the quality and faithfulness of text-driven generative models\nbeyond extensive text-image pair annotations? 
To address this question, we\npropose Knowledge Pursuit Prompting (KPP), a zero-shot framework that\niteratively incorporates external knowledge to help generators produce reliable\nvisual content. Instead of training generators to handle generic prompts, KPP\nemploys a recursive knowledge query process to gather informative external\nfacts from the knowledge base, instructs a language model to compress the\nacquired knowledge for prompt refinement, and utilizes text-driven generators\nfor visual synthesis. The entire process is zero-shot, without accessing the\narchitectures and parameters of generative models. We evaluate the framework\nacross multiple text-driven generative tasks (image, 3D rendering, and video)\non datasets of different domains. We further demonstrate the extensibility and\nadaptability of KPP through varying foundation model bases and instructions.\nOur results show that KPP is capable of generating faithful and semantically\nrich content across diverse visual domains, offering a promising solution to\nimprove multimodal generative models.\n","authors":["Jinqi Luo","Kwan Ho Ryan Chan","Dimitris Dimos","René Vidal"],"pdf_url":"https://arxiv.org/pdf/2311.17898v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.10649v2","updated":"2023-11-29T18:45:05Z","published":"2022-11-19T10:21:50Z","title":"LibSignal: An Open Library for Traffic Signal Control","summary":" This paper introduces a library for cross-simulator comparison of\nreinforcement learning models in traffic signal control tasks. This library is\ndeveloped to implement recent state-of-the-art reinforcement learning models\nwith extensible interfaces and unified cross-simulator evaluation metrics. It\nsupports commonly-used simulators in traffic signal control tasks, including\nSimulation of Urban MObility(SUMO) and CityFlow, and multiple benchmark\ndatasets for fair comparisons. We conducted experiments to validate our\nimplementation of the models and to calibrate the simulators so that the\nexperiments from one simulator could be referential to the other. Based on the\nvalidated models and calibrated environments, this paper compares and reports\nthe performance of current state-of-the-art RL algorithms across different\ndatasets and simulators. This is the first time that these methods have been\ncompared fairly under the same datasets with different simulators.\n","authors":["Hao Mei","Xiaoliang Lei","Longchao Da","Bin Shi","Hua Wei"],"pdf_url":"https://arxiv.org/pdf/2211.10649v2.pdf","comment":"11 pages + 6 pages appendix. Accepted by Machine Learning Journal\n (2023). A short version is accepted by NeurIPS 2022 Workshop: Reinforcement\n Learning for Real Life. Website: https://darl-libsignal.github.io/"},{"id":"http://arxiv.org/abs/2309.06800v5","updated":"2023-11-29T18:38:49Z","published":"2023-09-13T08:48:00Z","title":"Uncertainty-aware Traffic Prediction under Missing Data","summary":" Traffic prediction is a crucial topic because of its broad scope of\napplications in the transportation domain. Recently, various studies have\nachieved promising results. However, most studies assume the prediction\nlocations have complete or at least partial historical records and cannot be\nextended to non-historical recorded locations. 
In real-life scenarios, the\ndeployment of sensors could be limited due to budget limitations and\ninstallation availability, which makes most current models not applicable.\nThough few pieces of literature tried to impute traffic states at the missing\nlocations, these methods need the data simultaneously observed at the locations\nwith sensors, making them not applicable to prediction tasks. Another drawback\nis the lack of measurement of uncertainty in prediction, making prior works\nunsuitable for risk-sensitive tasks or involving decision-making. To fill the\ngap, inspired by the previous inductive graph neural network, this work\nproposed an uncertainty-aware framework with the ability to 1) extend\nprediction to missing locations with no historical records and significantly\nextend spatial coverage of prediction locations while reducing deployment of\nsensors and 2) generate probabilistic prediction with uncertainty\nquantification to help the management of risk and decision making in the\ndown-stream tasks. Through extensive experiments on real-life datasets, the\nresult shows our method achieved promising results on prediction tasks, and the\nuncertainty quantification gives consistent results which highly correlated\nwith the locations with and without historical data. We also show that our\nmodel could help support sensor deployment tasks in the transportation field to\nachieve higher accuracy with a limited sensor deployment budget.\n","authors":["Hao Mei","Junxian Li","Zhiming Liang","Guanjie Zheng","Bin Shi","Hua Wei"],"pdf_url":"https://arxiv.org/pdf/2309.06800v5.pdf","comment":"11 pages, 3 figures, a short version of this paper is accepted by\n ICDM 2023"},{"id":"http://arxiv.org/abs/2311.17885v1","updated":"2023-11-29T18:32:37Z","published":"2023-11-29T18:32:37Z","title":"Are ensembles getting better all the time?","summary":" Ensemble methods combine the predictions of several base models. We study\nwhether or not including more models in an ensemble always improve its average\nperformance. Such a question depends on the kind of ensemble considered, as\nwell as the predictive metric chosen. We focus on situations where all members\nof the ensemble are a priori expected to perform as well, which is the case of\nseveral popular methods like random forests or deep ensembles. In this setting,\nwe essentially show that ensembles are getting better all the time if, and only\nif, the considered loss function is convex. More precisely, in that case, the\naverage loss of the ensemble is a decreasing function of the number of models.\nWhen the loss function is nonconvex, we show a series of results that can be\nsummarised by the insight that ensembles of good models keep getting better,\nand ensembles of bad models keep getting worse. To this end, we prove a new\nresult on the monotonicity of tail probabilities that may be of independent\ninterest. 
We illustrate our results on a simple machine learning problem\n(diagnosing melanomas using neural nets).\n","authors":["Pierre-Alexandre Mattei","Damien Garreau"],"pdf_url":"https://arxiv.org/pdf/2311.17885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.10586v3","updated":"2023-11-29T18:23:57Z","published":"2022-06-21T17:59:20Z","title":"D-CIPHER: Discovery of Closed-form Partial Differential Equations","summary":" Closed-form differential equations, including partial differential equations\nand higher-order ordinary differential equations, are one of the most important\ntools used by scientists to model and better understand natural phenomena.\nDiscovering these equations directly from data is challenging because it\nrequires modeling relationships between various derivatives that are not\nobserved in the data (equation-data mismatch) and it involves searching across\na huge space of possible equations. Current approaches make strong assumptions\nabout the form of the equation and thus fail to discover many well-known\nsystems. Moreover, many of them resolve the equation-data mismatch by\nestimating the derivatives, which makes them inadequate for noisy and\ninfrequently sampled systems. To this end, we propose D-CIPHER, which is robust\nto measurement artifacts and can uncover a new and very general class of\ndifferential equations. We further design a novel optimization procedure,\nCoLLie, to help D-CIPHER search through this class efficiently. Finally, we\ndemonstrate empirically that it can discover many well-known equations that are\nbeyond the capabilities of current methods.\n","authors":["Krzysztof Kacprzyk","Zhaozhi Qian","Mihaela van der Schaar"],"pdf_url":"https://arxiv.org/pdf/2206.10586v3.pdf","comment":"To appear in the Proceedings of the 37th Conference on Neural\n Information Processing Systems (NeurIPS 2023)"},{"id":"http://arxiv.org/abs/2311.17869v1","updated":"2023-11-29T18:17:35Z","published":"2023-11-29T18:17:35Z","title":"SAIBench: A Structural Interpretation of AI for Science Through\n Benchmarks","summary":" Artificial Intelligence for Science (AI4S) is an emerging research field that\nutilizes machine learning advancements to tackle complex scientific\ncomputational issues, aiming to enhance computational efficiency and accuracy.\nHowever, the data-driven nature of AI4S lacks the correctness or accuracy\nassurances of conventional scientific computing, posing challenges when\ndeploying AI4S models in real-world applications. To mitigate these, more\ncomprehensive benchmarking procedures are needed to better understand AI4S\nmodels. This paper introduces a novel benchmarking approach, known as\nstructural interpretation, which addresses two key requirements: identifying\nthe trusted operating range in the problem space and tracing errors back to\ntheir computational components. This method partitions both the problem and\nmetric spaces, facilitating a structural exploration of these spaces. The\npractical utility and effectiveness of structural interpretation are\nillustrated through its application to three distinct AI4S workloads:\nmachine-learning force fields (MLFF), jet tagging, and precipitation\nnowcasting. The benchmarks effectively model the trusted operating range, trace\nerrors, and reveal novel perspectives for refining the model, training process,\nand data sampling strategy. 
This work is part of the SAIBench project, an AI4S\nbenchmarking suite.\n","authors":["Yatao Li","Jianfeng Zhan"],"pdf_url":"https://arxiv.org/pdf/2311.17869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.03865v3","updated":"2023-11-29T18:11:49Z","published":"2021-12-07T17:59:10Z","title":"Universalizing Weak Supervision","summary":" Weak supervision (WS) frameworks are a popular way to bypass hand-labeling\nlarge datasets for training data-hungry models. These approaches synthesize\nmultiple noisy but cheaply-acquired estimates of labels into a set of\nhigh-quality pseudolabels for downstream training. However, the synthesis\ntechnique is specific to a particular kind of label, such as binary labels or\nsequences, and each new label type requires manually designing a new synthesis\nalgorithm. Instead, we propose a universal technique that enables weak\nsupervision over any label type while still offering desirable properties,\nincluding practical flexibility, computational efficiency, and theoretical\nguarantees. We apply this technique to important problems previously not\ntackled by WS frameworks including learning to rank, regression, and learning\nin hyperbolic space. Theoretically, our synthesis approach produces\nconsistent estimators for learning some challenging but important\ngeneralizations of the exponential family model. Experimentally, we validate\nour framework and show improvement over baselines in diverse settings including\nreal-world learning-to-rank and regression problems along with learning on\nhyperbolic manifolds.\n","authors":["Changho Shin","Winfred Li","Harit Vishwakarma","Nicholas Roberts","Frederic Sala"],"pdf_url":"https://arxiv.org/pdf/2112.03865v3.pdf","comment":"ICLR 2022"},{"id":"http://arxiv.org/abs/2303.17713v3","updated":"2023-11-29T18:10:41Z","published":"2023-03-30T21:16:44Z","title":"Mitigating Source Bias for Fairer Weak Supervision","summary":" Weak supervision enables efficient development of training sets by reducing\nthe need for ground truth labels. However, the techniques that make weak\nsupervision attractive -- such as integrating any source of signal to estimate\nunknown labels -- also entail the danger that the produced pseudolabels are\nhighly biased. Surprisingly, given everyday use and the potential for increased\nbias, weak supervision has not been studied from the point of view of fairness.\nWe begin such a study, starting with the observation that even when a fair\nmodel can be built from a dataset with access to ground-truth labels, the\ncorresponding dataset labeled via weak supervision can be arbitrarily unfair.\nTo address this, we propose and empirically validate a model for source\nunfairness in weak supervision, then introduce a simple counterfactual\nfairness-based technique that can mitigate these biases. Theoretically, we show\nthat it is possible for our approach to simultaneously improve both accuracy\nand fairness -- in contrast to standard fairness approaches that suffer from\ntradeoffs. Empirically, we show that our technique improves accuracy on weak\nsupervision baselines by as much as 32\\% while reducing demographic parity gap\nby 82.5\\%. 
A simple extension of our method aimed at maximizing performance\nproduces state-of-the-art performance in five out of ten datasets in the WRENCH\nbenchmark.\n","authors":["Changho Shin","Sonia Cromp","Dyah Adila","Frederic Sala"],"pdf_url":"https://arxiv.org/pdf/2303.17713v3.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.17856v1","updated":"2023-11-29T18:02:29Z","published":"2023-11-29T18:02:29Z","title":"Leveraging Graph Diffusion Models for Network Refinement Tasks","summary":" Most real-world networks are noisy and incomplete samples from an unknown\ntarget distribution. Refining them by correcting corruptions or inferring\nunobserved regions typically improves downstream performance. Inspired by the\nimpressive generative capabilities that have been used to correct corruptions\nin images, and the similarities between \"in-painting\" and filling in missing\nnodes and edges conditioned on the observed graph, we propose a novel graph\ngenerative framework, SGDM, which is based on subgraph diffusion. Our framework\nnot only improves the scalability and fidelity of graph diffusion models, but\nalso leverages the reverse process to perform novel, conditional generation\ntasks. In particular, through extensive empirical analysis and a set of novel\nmetrics, we demonstrate that our proposed model effectively supports the\nfollowing refinement tasks for partially observable networks: T1: denoising\nextraneous subgraphs, T2: expanding existing subgraphs and T3: performing\n\"style\" transfer by regenerating a particular subgraph to match the\ncharacteristics of a different node or subgraph.\n","authors":["Puja Trivedi","Ryan Rossi","David Arbour","Tong Yu","Franck Dernoncourt","Sungchul Kim","Nedim Lipka","Namyong Park","Nesreen K. Ahmed","Danai Koutra"],"pdf_url":"https://arxiv.org/pdf/2311.17856v1.pdf","comment":"Work in Progress. 21 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.17855v1","updated":"2023-11-29T18:00:41Z","published":"2023-11-29T18:00:41Z","title":"Maximum Entropy Model Correction in Reinforcement Learning","summary":" We propose and theoretically analyze an approach for planning with an\napproximate model in reinforcement learning that can reduce the adverse impact\nof model error. If the model is accurate enough, it accelerates the convergence\nto the true value function too. One of its key components is the MaxEnt Model\nCorrection (MoCo) procedure that corrects the model's next-state distributions\nbased on a Maximum Entropy density estimation formulation. Based on MoCo, we\nintroduce the Model Correcting Value Iteration (MoCoVI) algorithm, and its\nsampled-based variant MoCoDyna. We show that MoCoVI and MoCoDyna's convergence\ncan be much faster than the conventional model-free algorithms. Unlike\ntraditional model-based algorithms, MoCoVI and MoCoDyna effectively utilize an\napproximate model and still converge to the correct value function.\n","authors":["Amin Rakhsha","Mete Kemertas","Mohammad Ghavamzadeh","Amir-massoud Farahmand"],"pdf_url":"https://arxiv.org/pdf/2311.17855v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17853v1","updated":"2023-11-29T17:59:18Z","published":"2023-11-29T17:59:18Z","title":"On the Adversarial Robustness of Graph Contrastive Learning Methods","summary":" Contrastive learning (CL) has emerged as a powerful framework for learning\nrepresentations of images and text in a self-supervised manner while enhancing\nmodel robustness against adversarial attacks. 
More recently, researchers have\nextended the principles of contrastive learning to graph-structured data,\ngiving birth to the field of graph contrastive learning (GCL). However, whether\nGCL methods can deliver the same advantages in adversarial robustness as their\ncounterparts in the image and text domains remains an open question. In this\npaper, we introduce a comprehensive robustness evaluation protocol tailored to\nassess the robustness of GCL models. We subject these models to adaptive\nadversarial attacks targeting the graph structure, specifically in the evasion\nscenario. We evaluate node and graph classification tasks using diverse\nreal-world datasets and attack strategies. With our work, we aim to offer\ninsights into the robustness of GCL methods and hope to open avenues for\npotential future research directions.\n","authors":["Filippo Guerranti","Zinuo Yi","Anna Starovoit","Rafiq Kamel","Simon Geisler","Stephan Günnemann"],"pdf_url":"https://arxiv.org/pdf/2311.17853v1.pdf","comment":"Accepted at NeurIPS 2023 New Frontiers in Graph Learning Workshop\n (NeurIPS GLFrontiers 2023)"},{"id":"http://arxiv.org/abs/2311.17842v1","updated":"2023-11-29T17:46:25Z","published":"2023-11-29T17:46:25Z","title":"Look Before You Leap: Unveiling the Power of GPT-4V in Robotic\n Vision-Language Planning","summary":" In this study, we are interested in imbuing robots with the capability of\nphysically-grounded task planning. Recent advancements have shown that large\nlanguage models (LLMs) possess extensive knowledge useful in robotic tasks,\nespecially in reasoning and planning. However, LLMs are constrained by their\nlack of world grounding and dependence on external affordance models to\nperceive environmental information, which cannot jointly reason with LLMs. We\nargue that a task planner should be an inherently grounded, unified multimodal\nsystem. To this end, we introduce Robotic Vision-Language Planning (ViLa), a\nnovel approach for long-horizon robotic planning that leverages vision-language\nmodels (VLMs) to generate a sequence of actionable steps. ViLa directly\nintegrates perceptual data into its reasoning and planning process, enabling a\nprofound understanding of commonsense knowledge in the visual world, including\nspatial layouts and object attributes. It also supports flexible multimodal\ngoal specification and naturally incorporates visual feedback. Our extensive\nevaluation, conducted in both real-robot and simulated environments,\ndemonstrates ViLa's superiority over existing LLM-based planners, highlighting\nits effectiveness in a wide array of open-world manipulation tasks.\n","authors":["Yingdong Hu","Fanqi Lin","Tong Zhang","Li Yi","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2311.17842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17840v1","updated":"2023-11-29T17:42:05Z","published":"2023-11-29T17:42:05Z","title":"A quasi-polynomial time algorithm for Multi-Dimensional Scaling via LP\n hierarchies","summary":" Multi-dimensional Scaling (MDS) is a family of methods for embedding\npair-wise dissimilarities between $n$ objects into low-dimensional space. MDS\nis widely used as a data visualization tool in the social and biological\nsciences, statistics, and machine learning. 
We study the Kamada-Kawai\nformulation of MDS: given a set of non-negative dissimilarities $\\{d_{i,j}\\}_{i\n, j \\in [n]}$ over $n$ points, the goal is to find an embedding\n$\\{x_1,\\dots,x_n\\} \\subset \\mathbb{R}^k$ that minimizes \\[ \\text{OPT} =\n\\min_{x} \\mathbb{E}_{i,j \\in [n]} \\left[ \\left(1-\\frac{\\|x_i -\nx_j\\|}{d_{i,j}}\\right)^2 \\right] \\]\n Despite its popularity, our theoretical understanding of MDS is extremely\nlimited. Recently, Demaine, Hesterberg, Koehler, Lynch, and Urschel\n(arXiv:2109.11505) gave the first approximation algorithm with provable\nguarantees for Kamada-Kawai, which achieves an embedding with cost $\\text{OPT}\n+\\epsilon$ in $n^2 \\cdot 2^{\\tilde{\\mathcal{O}}(k \\Delta^4 / \\epsilon^2)}$\ntime, where $\\Delta$ is the aspect ratio of the input dissimilarities. In this\nwork, we give the first approximation algorithm for MDS with quasi-polynomial\ndependency on $\\Delta$: for target dimension $k$, we achieve a solution with\ncost $\\mathcal{O}(\\text{OPT}^{ \\hspace{0.04in}1/k } \\cdot \\log(\\Delta/\\epsilon)\n)+ \\epsilon$ in time $n^{ \\mathcal{O}(1)} \\cdot 2^{\\tilde{\\mathcal{O}}( k^2\n(\\log(\\Delta)/\\epsilon)^{k/2 + 1} ) }$.\n Our approach is based on a novel analysis of a conditioning-based rounding\nscheme for the Sherali-Adams LP Hierarchy. Crucially, our analysis exploits the\ngeometry of low-dimensional Euclidean space, allowing us to avoid an\nexponential dependence on the aspect ratio $\\Delta$. We believe our\ngeometry-aware treatment of the Sherali-Adams Hierarchy is an important step\ntowards developing general-purpose techniques for efficient metric optimization\nalgorithms.\n","authors":["Ainesh Bakshi","Vincent Cohen-Addad","Samuel B. Hopkins","Rajesh Jayaram","Silvio Lattanzi"],"pdf_url":"https://arxiv.org/pdf/2311.17840v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11518v3","updated":"2023-11-29T17:39:17Z","published":"2023-10-17T18:33:21Z","title":"Guarantees for Self-Play in Multiplayer Games via Polymatrix\n Decomposability","summary":" Self-play is a technique for machine learning in multi-agent systems where a\nlearning algorithm learns by interacting with copies of itself. Self-play is\nuseful for generating large quantities of data for learning, but has the\ndrawback that the agents the learner will face post-training may have\ndramatically different behavior than the learner came to expect by interacting\nwith itself. For the special case of two-player constant-sum games, self-play\nthat reaches Nash equilibrium is guaranteed to produce strategies that perform\nwell against any post-training opponent; however, no such guarantee exists for\nmultiplayer games. We show that in games that approximately decompose into a\nset of two-player constant-sum games (called constant-sum polymatrix games)\nwhere global $\\epsilon$-Nash equilibria are boundedly far from Nash equilibria\nin each subgame (called subgame stability), any no-external-regret algorithm\nthat learns by self-play will produce a strategy with bounded vulnerability.\nFor the first time, our results identify a structural property of multiplayer\ngames that enable performance guarantees for the strategies produced by a broad\nclass of self-play algorithms. We demonstrate our findings through experiments\non Leduc poker.\n","authors":["Revan MacQueen","James R. 
Wright"],"pdf_url":"https://arxiv.org/pdf/2310.11518v3.pdf","comment":"To appear at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.17833v1","updated":"2023-11-29T17:35:29Z","published":"2023-11-29T17:35:29Z","title":"Analyzing and Explaining Image Classifiers via Diffusion Guidance","summary":" While deep learning has led to huge progress in complex image classification\ntasks like ImageNet, unexpected failure modes, e.g. via spurious features, call\ninto question how reliably these classifiers work in the wild. Furthermore, for\nsafety-critical tasks the black-box nature of their decisions is problematic,\nand explanations or at least methods which make decisions plausible are needed\nurgently. In this paper, we address these problems by generating images that\noptimize a classifier-derived objective using a framework for guided image\ngeneration. We analyze the behavior and decisions of image classifiers by\nvisual counterfactual explanations (VCEs), detection of systematic mistakes by\nanalyzing images where classifiers maximally disagree, and visualization of\nneurons to verify potential spurious features. In this way, we validate\nexisting observations, e.g. the shape bias of adversarially robust models, as\nwell as novel failure modes, e.g. systematic errors of zero-shot CLIP\nclassifiers, or identify harmful spurious features. Moreover, our VCEs\noutperform previous work while being more versatile.\n","authors":["Maximilian Augustin","Yannic Neuhaus","Matthias Hein"],"pdf_url":"https://arxiv.org/pdf/2311.17833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17801v1","updated":"2023-11-29T16:51:21Z","published":"2023-11-29T16:51:21Z","title":"Towards Efficient Hyperdimensional Computing Using Photonics","summary":" Over the past few years, silicon photonics-based computing has emerged as a\npromising alternative to CMOS-based computing for Deep Neural Networks (DNN).\nUnfortunately, the non-linear operations and the high-precision requirements of\nDNNs make it extremely challenging to design efficient silicon photonics-based\nsystems for DNN inference and training. Hyperdimensional Computing (HDC) is an\nemerging, brain-inspired machine learning technique that enjoys several\nadvantages over existing DNNs, including being lightweight, requiring\nlow-precision operands, and being robust to noise introduced by the\nnonidealities in the hardware. For HDC, computing in-memory (CiM) approaches\nhave been widely used, as CiM reduces the data transfer cost if the operands\ncan fit into the memory. However, inefficient multi-bit operations, high write\nlatency, and low endurance make CiM ill-suited for HDC. On the other hand, the\nexisting electro-photonic DNN accelerators are inefficient for HDC because they\nare specifically optimized for matrix multiplication in DNNs and consume a lot\nof power with high-precision data converters.\n In this paper, we argue that photonic computing and HDC complement each other\nbetter than photonic computing and DNNs, or CiM and HDC. We propose PhotoHDC,\nthe first-ever electro-photonic accelerator for HDC training and inference,\nsupporting the basic, record-based, and graph encoding schemes. Evaluating with\npopular datasets, we show that our accelerator can achieve two to five orders\nof magnitude lower EDP than the state-of-the-art electro-photonic DNN\naccelerators for implementing HDC training and inference. 
PhotoHDC also\nachieves four orders of magnitude lower energy-delay product than CiM-based\naccelerators for both HDC training and inference.\n","authors":["Farbin Fayza","Cansu Demirkiran","Hanning Chen","Che-Kai Liu","Avi Mohan","Hamza Errahmouni","Sanggeon Yun","Mohsen Imani","David Zhang","Darius Bunandar","Ajay Joshi"],"pdf_url":"https://arxiv.org/pdf/2311.17801v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17797v1","updated":"2023-11-29T16:46:24Z","published":"2023-11-29T16:46:24Z","title":"Learning to Simulate: Generative Metamodeling via Quantile Regression","summary":" Stochastic simulation models, while effective in capturing the dynamics of\ncomplex systems, are often too slow to run for real-time decision-making.\nMetamodeling techniques are widely used to learn the relationship between a\nsummary statistic of the outputs (e.g., the mean or quantile) and the inputs of\nthe simulator, so that it can be used in real time. However, this methodology\nrequires the knowledge of an appropriate summary statistic in advance, making\nit inflexible for many practical situations. In this paper, we propose a new\nmetamodeling concept, called generative metamodeling, which aims to construct a\n\"fast simulator of the simulator\". This technique can generate random outputs\nsubstantially faster than the original simulation model, while retaining an\napproximately equal conditional distribution given the same inputs. Once\nconstructed, a generative metamodel can instantaneously generate a large amount\nof random outputs as soon as the inputs are specified, thereby facilitating the\nimmediate computation of any summary statistic for real-time decision-making.\nFurthermore, we propose a new algorithm -- quantile-regression-based generative\nmetamodeling (QRGMM) -- and study its convergence and rate of convergence.\nExtensive numerical experiments are conducted to investigate the empirical\nperformance of QRGMM, compare it with other state-of-the-art generative\nalgorithms, and demonstrate its usefulness in practical real-time\ndecision-making.\n","authors":["L. Jeff Hong","Yanxi Hou","Qingkai Zhang","Xiaowei Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.17797v1.pdf","comment":"Main body: 36 pages, 7 figures; supplemental material: 12 pages"},{"id":"http://arxiv.org/abs/2311.17795v1","updated":"2023-11-29T16:45:43Z","published":"2023-11-29T16:45:43Z","title":"Marginal Laplacian Score","summary":" High-dimensional imbalanced data poses a machine learning challenge. In the\nabsence of sufficient or high-quality labels, unsupervised feature selection\nmethods are crucial for the success of subsequent algorithms. Therefore, there\nis a growing need for unsupervised feature selection algorithms focused on\nimbalanced data. Thus, we propose a Marginal Laplacian Score (MLS) a\nmodification of the well-known Laplacian Score (LS) to be better suited for\nimbalance data. We introduce an assumption that the minority class or anomalous\nappear more frequently in the margin of the features. Consequently, MLS aims to\npreserve the local structure of the data set's margin. As MLS is better suited\nfor handling imbalanced data, we propose its integration into modern feature\nselection methods that utilize the Laplacian score. We integrate the MLS\nalgorithm into the Differentiable Unsupervised Feature Selection (DUFS),\nresulting in DUFS-MLS. 
The proposed methods demonstrate robust and improved\nperformance on synthetic and public data sets.\n","authors":["Guy Hay","Ohad Volk"],"pdf_url":"https://arxiv.org/pdf/2311.17795v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2305.12534v3","updated":"2023-11-29T16:43:29Z","published":"2023-05-21T18:26:31Z","title":"BertRLFuzzer: A BERT and Reinforcement Learning based Fuzzer","summary":" We present a novel tool BertRLFuzzer, a BERT and Reinforcement Learning (RL)\nbased fuzzer aimed at finding security vulnerabilities for Web applications.\nBertRLFuzzer works as follows: given a set of seed inputs, the fuzzer performs\ngrammar-adhering and attack-provoking mutation operations on them to generate\ncandidate attack vectors. The key insight of BertRLFuzzer is the use of RL with\na BERT model as an agent to guide the fuzzer to efficiently learn\ngrammar-adhering and attack-provoking mutation operators. In order to establish\nthe efficacy of BertRLFuzzer we compare it against a total of 13 black box and\nwhite box fuzzers over a benchmark of 9 victim websites with over 16K LOC. We\nobserved a significant improvement relative to the nearest competing tool in\nterms of time to first attack (54% less), new vulnerabilities found (17 new\nvulnerabilities), and attack rate (4.4% more attack vectors generated).\n","authors":["Piyush Jha","Joseph Scott","Jaya Sriram Ganeshna","Mudit Singh","Vijay Ganesh"],"pdf_url":"https://arxiv.org/pdf/2305.12534v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09949v3","updated":"2023-11-29T16:34:49Z","published":"2023-10-15T20:57:25Z","title":"Chameleon: a heterogeneous and disaggregated accelerator system for\n retrieval-augmented language models","summary":" A Retrieval-Augmented Language Model (RALM) augments a generative language\nmodel by retrieving context-specific knowledge from an external database. This\nstrategy facilitates impressive text generation quality even with smaller\nmodels, thus reducing orders of magnitude of computational demands. However,\nRALMs introduce unique system design challenges due to (a) the diverse workload\ncharacteristics between LM inference and retrieval and (b) the various system\nrequirements and bottlenecks for different RALM configurations such as model\nsizes, database sizes, and retrieval frequencies. We propose Chameleon, a\nheterogeneous accelerator system that integrates both LM and retrieval\naccelerators in a disaggregated architecture. The heterogeneity ensures\nefficient acceleration of both LM inference and retrieval, while the\naccelerator disaggregation enables the system to independently scale both types\nof accelerators to fulfill diverse RALM requirements. Our Chameleon prototype\nimplements retrieval accelerators on FPGAs and assigns LM inference to GPUs,\nwith a CPU server orchestrating these accelerators over the network. Compared\nto CPU-based and CPU-GPU vector search systems, Chameleon achieves up to 23.72x\nspeedup and 26.2x energy efficiency. Evaluated on various RALMs, Chameleon\nexhibits up to 2.16x reduction in latency and 3.18x speedup in throughput\ncompared to the hybrid CPU-GPU architecture. 
These promising results pave the\nway for bringing accelerator heterogeneity and disaggregation into future RALM\nsystems.\n","authors":["Wenqi Jiang","Marco Zeller","Roger Waleffe","Torsten Hoefler","Gustavo Alonso"],"pdf_url":"https://arxiv.org/pdf/2310.09949v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17781v1","updated":"2023-11-29T16:26:24Z","published":"2023-11-29T16:26:24Z","title":"Propagate & Distill: Towards Effective Graph Learners Using\n Propagation-Embracing MLPs","summary":" Recent studies attempted to utilize multilayer perceptrons (MLPs) to solve\nsemisupervised node classification on graphs, by training a student MLP by\nknowledge distillation from a teacher graph neural network (GNN). While\nprevious studies have focused mostly on training the student MLP by matching\nthe output probability distributions between the teacher and student models\nduring distillation, it has not been systematically studied how to inject the\nstructural information in an explicit and interpretable manner. Inspired by\nGNNs that separate feature transformation $T$ and propagation $\\Pi$, we\nre-frame the distillation process as making the student MLP learn both $T$ and\n$\\Pi$. Although this can be achieved by applying the inverse propagation\n$\\Pi^{-1}$ before distillation from the teacher, it still comes with a high\ncomputational cost from large matrix multiplications during training. To solve\nthis problem, we propose Propagate & Distill (P&D), which propagates the output\nof the teacher before distillation, which can be interpreted as an approximate\nprocess of the inverse propagation. We demonstrate that P&D can readily improve\nthe performance of the student MLP.\n","authors":["Yong-Min Shin","Won-Yong Shin"],"pdf_url":"https://arxiv.org/pdf/2311.17781v1.pdf","comment":"17 pages, 2 figures, 8 tables; 2nd Learning on Graphs Conference (LoG\n 2023) (Please cite our conference version.). arXiv admin note: substantial\n text overlap with arXiv:2311.11759"},{"id":"http://arxiv.org/abs/2311.17778v1","updated":"2023-11-29T16:24:32Z","published":"2023-11-29T16:24:32Z","title":"Unified Binary and Multiclass Margin-Based Classification","summary":" The notion of margin loss has been central to the development and analysis of\nalgorithms for binary classification. To date, however, there remains no\nconsensus as to the analogue of the margin loss for multiclass classification.\nIn this work, we show that a broad range of multiclass loss functions,\nincluding many popular ones, can be expressed in the relative margin form, a\ngeneralization of the margin form of binary losses. The relative margin form is\nbroadly useful for understanding and analyzing multiclass losses as shown by\nour prior work (Wang and Scott, 2020, 2021). To further demonstrate the utility\nof this way of expressing multiclass losses, we use it to extend the seminal\nresult of Bartlett et al. (2006) on classification-calibration of binary margin\nlosses to multiclass. We then analyze the class of Fenchel-Young losses, and\nexpand the set of these losses that are known to be classification-calibrated.\n","authors":["Yutong Wang","Clayton Scott"],"pdf_url":"https://arxiv.org/pdf/2311.17778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13903v3","updated":"2023-11-29T16:19:21Z","published":"2023-07-26T02:02:19Z","title":"Corruption-Robust Lipschitz Contextual Search","summary":" I study the problem of learning a Lipschitz function with corrupted binary\nsignals. 
The learner tries to learn a $L$-Lipschitz function $f: [0,1]^d\n\\rightarrow [0, L]$ that the adversary chooses. There is a total of $T$ rounds.\nIn each round $t$, the adversary selects a context vector $x_t$ in the input\nspace, and the learner makes a guess to the true function value $f(x_t)$ and\nreceives a binary signal indicating whether the guess is high or low. In a\ntotal of $C$ rounds, the signal may be corrupted, though the value of $C$ is\n\\emph{unknown} to the learner. The learner's goal is to incur a small\ncumulative loss. This work introduces the new algorithmic technique\n\\emph{agnostic checking} as well as new analysis techniques. I design\nalgorithms which: for the symmetric loss, the learner achieves regret $L\\cdot\nO(C\\log T)$ with $d = 1$ and $L\\cdot O_d(C\\log T + T^{(d-1)/d})$ with $d > 1$;\nfor the pricing loss, the learner achieves regret $L\\cdot \\widetilde{O}\n(T^{d/(d+1)} + C\\cdot T^{1/(d+1)})$.\n","authors":["Shiliang Zuo"],"pdf_url":"https://arxiv.org/pdf/2307.13903v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14675v2","updated":"2023-11-29T16:19:16Z","published":"2023-10-30T20:03:34Z","title":"Fast and Expressive Gesture Recognition using a Combination-Homomorphic\n Electromyogram Encoder","summary":" We study the task of gesture recognition from electromyography (EMG), with\nthe goal of enabling expressive human-computer interaction at high accuracy,\nwhile minimizing the time required for new subjects to provide calibration\ndata. To fulfill these goals, we define combination gestures consisting of a\ndirection component and a modifier component. New subjects only demonstrate the\nsingle component gestures and we seek to extrapolate from these to all possible\nsingle or combination gestures. We extrapolate to unseen combination gestures\nby combining the feature vectors of real single gestures to produce synthetic\ntraining data. This strategy allows us to provide a large and flexible gesture\nvocabulary, while not requiring new subjects to demonstrate combinatorially\nmany example gestures. We pre-train an encoder and a combination operator using\nself-supervision, so that we can produce useful synthetic training data for\nunseen test subjects. To evaluate the proposed method, we collect a real-world\nEMG dataset, and measure the effect of augmented supervision against two\nbaselines: a partially-supervised model trained with only single gesture data\nfrom the unseen subject, and a fully-supervised model trained with real single\nand real combination gesture data from the unseen subject. We find that the\nproposed method provides a dramatic improvement over the partially-supervised\nmodel, and achieves a useful classification accuracy that in some cases\napproaches the performance of the fully-supervised model.\n","authors":["Niklas Smedemark-Margulies","Yunus Bicer","Elifnur Sunger","Tales Imbiriba","Eugene Tunik","Deniz Erdogmus","Mathew Yarossi","Robin Walters"],"pdf_url":"https://arxiv.org/pdf/2311.14675v2.pdf","comment":"24 pages, 7 figures, 6 tables V2: add link to code, fix bibliography"},{"id":"http://arxiv.org/abs/2309.03060v2","updated":"2023-11-29T16:17:26Z","published":"2023-09-06T14:59:38Z","title":"CoLA: Exploiting Compositional Structure for Automatic and Efficient\n Numerical Linear Algebra","summary":" Many areas of machine learning and science involve large linear algebra\nproblems, such as eigendecompositions, solving linear systems, computing matrix\nexponentials, and trace estimation. 
The matrices involved often have Kronecker,\nconvolutional, block diagonal, sum, or product structure. In this paper, we\npropose a simple but general framework for large-scale linear algebra problems\nin machine learning, named CoLA (Compositional Linear Algebra). By combining a\nlinear operator abstraction with compositional dispatch rules, CoLA\nautomatically constructs memory and runtime efficient numerical algorithms.\nMoreover, CoLA provides memory efficient automatic differentiation, low\nprecision computation, and GPU acceleration in both JAX and PyTorch, while also\naccommodating new objects, operations, and rules in downstream packages via\nmultiple dispatch. CoLA can accelerate many algebraic operations, while making\nit easy to prototype matrix structures and algorithms, providing an appealing\ndrop-in tool for virtually any computational effort that requires linear\nalgebra. We showcase its efficacy across a broad range of applications,\nincluding partial differential equations, Gaussian processes, equivariant model\nconstruction, and unsupervised learning.\n","authors":["Andres Potapczynski","Marc Finzi","Geoff Pleiss","Andrew Gordon Wilson"],"pdf_url":"https://arxiv.org/pdf/2309.03060v2.pdf","comment":"Code available at https://github.com/wilson-labs/cola. NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.17750v1","updated":"2023-11-29T15:54:15Z","published":"2023-11-29T15:54:15Z","title":"Addressing Membership Inference Attack in Federated Learning with Model\n Compression","summary":" Federated Learning (FL) has been proposed as a privacy-preserving solution\nfor machine learning. However, recent works have shown that Federated Learning\ncan leak private client data through membership attacks. In this paper, we show\nthat the effectiveness of these attacks on the clients negatively correlates\nwith the size of the client datasets and model complexity. Based on this\nfinding, we propose model-agnostic Federated Learning as a privacy-enhancing\nsolution because it enables the use of models of varying complexity in the\nclients. To this end, we present $\\texttt{MaPP-FL}$, a novel privacy-aware FL\napproach that leverages model compression on the clients while keeping a full\nmodel on the server. We compare the performance of $\\texttt{MaPP-FL}$ against\nstate-of-the-art model-agnostic FL methods on the CIFAR-10, CIFAR-100, and\nFEMNIST vision datasets. Our experiments show the effectiveness of\n$\\texttt{MaPP-FL}$ in preserving the clients' and the server's privacy while\nachieving competitive classification accuracies.\n","authors":["Gergely Dániel Németh","Miguel Ángel Lozano","Novi Quadrianto","Nuria Oliver"],"pdf_url":"https://arxiv.org/pdf/2311.17750v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01628v3","updated":"2023-11-29T15:46:36Z","published":"2023-04-04T08:33:13Z","title":"Equivariant Parameter Sharing for Porous Crystalline Materials","summary":" Efficiently predicting properties of porous crystalline materials has great\npotential to accelerate the high throughput screening process for developing\nnew materials, as simulations carried out using first principles model are\noften computationally expensive. To effectively make use of Deep Learning\nmethods to model these materials, we need to utilize the symmetries present in\nthe crystals, which are defined by their space group. Existing methods for\ncrystal property prediction either have symmetry constraints that are too\nrestrictive or only incorporate symmetries between unit cells. 
In addition,\nthese models do not explicitly model the porous structure of the crystal. In\nthis paper, we develop a model which incorporates the symmetries of the unit\ncell of a crystal in its architecture and explicitly models the porous\nstructure. We evaluate our model by predicting the heat of adsorption of CO$_2$\nfor different configurations of the mordenite zeolite. Our results confirm that\nour method performs better than existing methods for crystal property\nprediction and that the inclusion of pores results in a more efficient model.\n","authors":["Marko Petković","Pablo Romero-Marimon","Vlado Menkovski","Sofia Calero"],"pdf_url":"https://arxiv.org/pdf/2304.01628v3.pdf","comment":"Additional results"},{"id":"http://arxiv.org/abs/2311.15940v2","updated":"2023-11-29T15:46:23Z","published":"2023-11-27T15:47:33Z","title":"Physics-informed neural networks for transformed geometries and\n manifolds","summary":" Physics-informed neural networks (PINNs) effectively embed physical\nprinciples into machine learning, but often struggle with complex or\nalternating geometries. We propose a novel method for integrating geometric\ntransformations within PINNs to robustly accommodate geometric variations. Our\nmethod incorporates a diffeomorphism as a mapping of a reference domain and\nadapts the derivative computation of the physics-informed loss function. This\ngeneralizes the applicability of PINNs not only to smoothly deformed domains,\nbut also to lower-dimensional manifolds and allows for direct shape\noptimization while training the network. We demonstrate the effectivity of our\napproach on several problems: (i) Eikonal equation on Archimedean spiral, (ii)\nPoisson problem on surface manifold, (iii) Incompressible Stokes flow in\ndeformed tube, and (iv) Shape optimization with Laplace operator. Through these\nexamples, we demonstrate the enhanced flexibility over traditional PINNs,\nespecially under geometric variations. The proposed framework presents an\noutlook for training deep neural operators over parametrized geometries, paving\nthe way for advanced modeling with PDEs on complex geometries in science and\nengineering.\n","authors":["Samuel Burbulla"],"pdf_url":"https://arxiv.org/pdf/2311.15940v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17740v1","updated":"2023-11-29T15:44:00Z","published":"2023-11-29T15:44:00Z","title":"A transductive few-shot learning approach for classification of digital\n histopathological slides from liver cancer","summary":" This paper presents a new approach for classifying 2D histopathology patches\nusing few-shot learning. The method is designed to tackle a significant\nchallenge in histopathology, which is the limited availability of labeled data.\nBy applying a sliding window technique to histopathology slides, we illustrate\nthe practical benefits of transductive learning (i.e., making joint predictions\non patches) to achieve consistent and accurate classification. Our approach\ninvolves an optimization-based strategy that actively penalizes the prediction\nof a large number of distinct classes within each window. We conducted\nexperiments on histopathological data to classify tissue classes in digital\nslides of liver cancer, specifically hepatocellular carcinoma. 
The initial\nresults show the effectiveness of our method and its potential to enhance the\nprocess of automated cancer diagnosis and treatment, all while reducing the\ntime and effort required for expert annotation.\n","authors":["Aymen Sadraoui","Ségolène Martin","Eliott Barbot","Astrid Laurent-Bellue","Jean-Christophe Pesquet","Catherine Guettier","Ismail Ben Ayed"],"pdf_url":"https://arxiv.org/pdf/2311.17740v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17722v1","updated":"2023-11-29T15:21:35Z","published":"2023-11-29T15:21:35Z","title":"SenTest: Evaluating Robustness of Sentence Encoders","summary":" Contrastive learning has proven to be an effective method for pre-training\nmodels using weakly labeled data in the vision domain. Sentence transformers\nare the NLP counterparts to this architecture, and have been growing in\npopularity due to their rich and effective sentence representations. Having\neffective sentence representations is paramount in multiple tasks, such as\ninformation retrieval, retrieval augmented generation (RAG), and sentence\ncomparison. Keeping in mind the deployability factor of transformers,\nevaluating the robustness of sentence transformers is of utmost importance.\nThis work focuses on evaluating the robustness of the sentence encoders. We\nemploy several adversarial attacks to evaluate its robustness. This system uses\ncharacter-level attacks in the form of random character substitution,\nword-level attacks in the form of synonym replacement, and sentence-level\nattacks in the form of intra-sentence word order shuffling. The results of the\nexperiments strongly undermine the robustness of sentence encoders. The models\nproduce significantly different predictions as well as embeddings on perturbed\ndatasets. The accuracy of the models can fall up to 15 percent on perturbed\ndatasets as compared to unperturbed datasets. Furthermore, the experiments\ndemonstrate that these embeddings does capture the semantic and syntactic\nstructure (sentence order) of sentences. However, existing supervised\nclassification strategies fail to leverage this information, and merely\nfunction as n-gram detectors.\n","authors":["Tanmay Chavan","Shantanu Patankar","Aditya Kane","Omkar Gokhale","Geetanjali Kale","Raviraj Joshi"],"pdf_url":"https://arxiv.org/pdf/2311.17722v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17717v1","updated":"2023-11-29T15:19:49Z","published":"2023-11-29T15:19:49Z","title":"Receler: Reliable Concept Erasing of Text-to-Image Diffusion Models via\n Lightweight Erasers","summary":" Concept erasure in text-to-image diffusion models aims to disable pre-trained\ndiffusion models from generating images related to a target concept. To perform\nreliable concept erasure, the properties of robustness and locality are\ndesirable. The former refrains the model from producing images associated with\nthe target concept for any paraphrased or learned prompts, while the latter\npreserves the model ability in generating images for non-target concepts. In\nthis paper, we propose Reliable Concept Erasing via Lightweight Erasers\n(Receler), which learns a lightweight Eraser to perform concept erasing and\nenhances locality and robustness with the proposed concept-localized\nregularization and adversarial prompt learning, respectively. 
Comprehensive\nquantitative and qualitative experiments with various concept prompts verify\nthe superiority of Receler over the previous erasing methods on the above two\ndesirable properties.\n","authors":["Chi-Pin Huang","Kai-Po Chang","Chung-Ting Tsai","Yung-Hsuan Lai","Yu-Chiang Frank Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12754v2","updated":"2023-11-29T15:19:38Z","published":"2023-11-21T17:59:14Z","title":"SelfOcc: Self-Supervised Vision-Based 3D Occupancy Prediction","summary":" 3D occupancy prediction is an important task for the robustness of\nvision-centric autonomous driving, which aims to predict whether each point is\noccupied in the surrounding 3D space. Existing methods usually require 3D\noccupancy labels to produce meaningful results. However, it is very laborious\nto annotate the occupancy status of each voxel. In this paper, we propose\nSelfOcc to explore a self-supervised way to learn 3D occupancy using only video\nsequences. We first transform the images into the 3D space (e.g., bird's eye\nview) to obtain 3D representation of the scene. We directly impose constraints\non the 3D representations by treating them as signed distance fields. We can\nthen render 2D images of previous and future frames as self-supervision signals\nto learn the 3D representations. We propose an MVS-embedded strategy to\ndirectly optimize the SDF-induced weights with multiple depth proposals. Our\nSelfOcc outperforms the previous best method SceneRF by 58.7% using a single\nframe as input on SemanticKITTI and is the first self-supervised work that\nproduces reasonable 3D occupancy for surround cameras on nuScenes. SelfOcc\nproduces high-quality depth and achieves state-of-the-art results on novel\ndepth synthesis, monocular depth estimation, and surround-view depth estimation\non the SemanticKITTI, KITTI-2015, and nuScenes, respectively. Code:\nhttps://github.com/huang-yh/SelfOcc.\n","authors":["Yuanhui Huang","Wenzhao Zheng","Borui Zhang","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2311.12754v2.pdf","comment":"Code is available at: https://github.com/huang-yh/SelfOcc"},{"id":"http://arxiv.org/abs/2310.05703v3","updated":"2023-11-29T15:12:00Z","published":"2023-10-09T13:24:44Z","title":"An Attribution Method for Siamese Encoders","summary":" Despite the success of Siamese encoder models such as sentence transformers\n(ST), little is known about the aspects of inputs they pay attention to. A\nbarrier is that their predictions cannot be attributed to individual features,\nas they compare two inputs rather than processing a single one. This paper\nderives a local attribution method for Siamese encoders by generalizing the\nprinciple of integrated gradients to models with multiple inputs. The solution\ntakes the form of feature-pair attributions, and can be reduced to a\ntoken-token matrix for STs. Our method involves the introduction of integrated\nJacobians and inherits the advantageous formal properties of integrated\ngradients: it accounts for the model's full computation graph and is guaranteed\nto converge to the actual prediction. A pilot study shows that in an ST few\ntoken-pairs can often explain large fractions of predictions, and it focuses on\nnouns and verbs. 
For accurate predictions, it however needs to attend to the\nmajority of tokens and parts of speech.\n","authors":["Lucas Möller","Dmitry Nikolaev","Sebastian Padó"],"pdf_url":"https://arxiv.org/pdf/2310.05703v3.pdf","comment":"Accepted to EMNLP'23"},{"id":"http://arxiv.org/abs/2310.01929v2","updated":"2023-11-29T15:11:02Z","published":"2023-10-03T10:13:36Z","title":"Navigating Cultural Chasms: Exploring and Unlocking the Cultural POV of\n Text-To-Image Models","summary":" Text-To-Image (TTI) models, such as DALL-E and StableDiffusion, have\ndemonstrated remarkable prompt-based image generation capabilities.\nMultilingual encoders may have a substantial impact on the cultural agency of\nthese models, as language is a conduit of culture. In this study, we explore\nthe cultural perception embedded in TTI models by characterizing culture across\nthree hierarchical tiers: cultural dimensions, cultural domains, and cultural\nconcepts. Based on this ontology, we derive prompt templates to unlock the\ncultural knowledge in TTI models, and propose a comprehensive suite of\nevaluation techniques, including intrinsic evaluations using the CLIP space,\nextrinsic evaluations with a Visual-Question-Answer (VQA) model and human\nassessments, to evaluate the cultural content of TTI-generated images. To\nbolster our research, we introduce the CulText2I dataset, derived from four\ndiverse TTI models and spanning ten languages. Our experiments provide insights\nregarding Do, What, Which and How research questions about the nature of\ncultural encoding in TTI models, paving the way for cross-cultural applications\nof these models.\n","authors":["Mor Ventura","Eyal Ben-David","Anna Korhonen","Roi Reichart"],"pdf_url":"https://arxiv.org/pdf/2310.01929v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17695v1","updated":"2023-11-29T15:02:01Z","published":"2023-11-29T15:02:01Z","title":"Fair Text-to-Image Diffusion via Fair Mapping","summary":" In this paper, we address the limitations of existing text-to-image diffusion\nmodels in generating demographically fair results when given human-related\ndescriptions. These models often struggle to disentangle the target language\ncontext from sociocultural biases, resulting in biased image generation. To\novercome this challenge, we propose Fair Mapping, a general, model-agnostic,\nand lightweight approach that modifies a pre-trained text-to-image model by\ncontrolling the prompt to achieve fair image generation. One key advantage of\nour approach is its high efficiency. The training process only requires\nupdating a small number of parameters in an additional linear mapping network.\nThis not only reduces the computational cost but also accelerates the\noptimization process. We first demonstrate the issue of bias in generated\nresults caused by language biases in text-guided diffusion models. By\ndeveloping a mapping network that projects language embeddings into an unbiased\nspace, we enable the generation of relatively balanced demographic results\nbased on a keyword specified in the prompt. With comprehensive experiments on\nface image generation, we show that our method significantly improves image\ngeneration performance when prompted with descriptions related to human faces.\nBy effectively addressing the issue of bias, we produce more fair and diverse\nimage outputs. 
This work contributes to the field of text-to-image generation\nby enhancing the ability to generate images that accurately reflect the\nintended demographic characteristics specified in the text.\n","authors":["Jia Li","Lijie Hu","Jingfeng Zhang","Tianhang Zheng","Hua Zhang","Di Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.06591v3","updated":"2023-11-29T15:00:42Z","published":"2022-10-12T21:10:55Z","title":"Rigorous dynamical mean field theory for stochastic gradient descent\n methods","summary":" We prove closed-form equations for the exact high-dimensional asymptotics of\na family of first order gradient-based methods, learning an estimator (e.g.\nM-estimator, shallow neural network, ...) from observations on Gaussian data\nwith empirical risk minimization. This includes widely used algorithms such as\nstochastic gradient descent (SGD) or Nesterov acceleration. The obtained\nequations match those resulting from the discretization of dynamical mean-field\ntheory (DMFT) equations from statistical physics when applied to gradient flow.\nOur proof method allows us to give an explicit description of how memory\nkernels build up in the effective dynamics, and to include non-separable update\nfunctions, allowing datasets with non-identity covariance matrices. Finally, we\nprovide numerical implementations of the equations for SGD with generic\nextensive batch-size and with constant learning rates.\n","authors":["Cedric Gerbelot","Emanuele Troiani","Francesca Mignacco","Florent Krzakala","Lenka Zdeborova"],"pdf_url":"https://arxiv.org/pdf/2210.06591v3.pdf","comment":"40 pages, 4 figures"},{"id":"http://arxiv.org/abs/2311.17693v1","updated":"2023-11-29T15:00:06Z","published":"2023-11-29T15:00:06Z","title":"Toward a Surgeon-in-the-Loop Ophthalmic Robotic Apprentice using\n Reinforcement and Imitation Learning","summary":" Robotic-assisted surgical systems have demonstrated significant potential in\nenhancing surgical precision and minimizing human errors. However, existing\nsystems lack the ability to accommodate the unique preferences and requirements\nof individual surgeons. Additionally, they primarily focus on general surgeries\n(e.g., laparoscopy) and are not suitable for highly precise microsurgeries,\nsuch as ophthalmic procedures. Thus, we propose a simulation-based image-guided\napproach for surgeon-centered autonomous agents that can adapt to the\nindividual surgeon's skill level and preferred surgical techniques during\nophthalmic cataract surgery. Our approach utilizes a simulated environment to\ntrain reinforcement and imitation learning agents guided by image data to\nperform all tasks of the incision phase of cataract surgery. By integrating the\nsurgeon's actions and preferences into the training process with the\nsurgeon-in-the-loop, our approach enables the robot to implicitly learn and\nadapt to the individual surgeon's unique approach through demonstrations. This\nresults in a more intuitive and personalized surgical experience for the\nsurgeon. Simultaneously, it ensures consistent performance for the autonomous\nrobotic apprentice. We define and evaluate the effectiveness of our approach\nusing our proposed metrics; and highlight the trade-off between a generic agent\nand a surgeon-centered adapted agent. Moreover, our approach has the potential\nto extend to other ophthalmic surgical procedures, opening the door to a new\ngeneration of surgeon-in-the-loop autonomous surgical robots. 
We provide an\nopen-source simulation framework for future development and reproducibility.\n","authors":["Amr Gomaa","Bilal Mahdy","Niko Kleer","Antonio Krüger"],"pdf_url":"https://arxiv.org/pdf/2311.17693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.01714v2","updated":"2023-11-29T14:54:04Z","published":"2023-02-03T13:11:57Z","title":"Learning End-to-End Channel Coding with Diffusion Models","summary":" It is a known problem that deep-learning-based end-to-end (E2E) channel\ncoding systems depend on a known and differentiable channel model, due to the\nlearning process and based on the gradient-descent optimization methods. This\nplaces the challenge to approximate or generate the channel or its derivative\nfrom samples generated by pilot signaling in real-world scenarios. Currently,\nthere are two prevalent methods to solve this problem. One is to generate the\nchannel via a generative adversarial network (GAN), and the other is to, in\nessence, approximate the gradient via reinforcement learning methods. Other\nmethods include using score-based methods, variational autoencoders, or\nmutual-information-based methods. In this paper, we focus on generative models\nand, in particular, on a new promising method called diffusion models, which\nhave shown a higher quality of generation in image-based tasks. We will show\nthat diffusion models can be used in wireless E2E scenarios and that they work\nas good as Wasserstein GANs while having a more stable training procedure and a\nbetter generalization ability in testing.\n","authors":["Muah Kim","Rick Fritschek","Rafael F. Schaefer"],"pdf_url":"https://arxiv.org/pdf/2302.01714v2.pdf","comment":"6 pages, WSA/SCC 2023"},{"id":"http://arxiv.org/abs/2311.16883v2","updated":"2023-11-29T14:41:36Z","published":"2023-11-28T15:31:31Z","title":"Compressing the Backward Pass of Large-Scale Neural Architectures by\n Structured Activation Pruning","summary":" The rise of Deep Neural Networks (DNNs) has led to an increase in model size\nand complexity, straining the memory capacity of GPUs. Sparsity in DNNs,\ncharacterized as structural or ephemeral, has gained attention as a solution.\nThis work focuses on ephemeral sparsity, aiming to reduce memory consumption\nduring training. It emphasizes the significance of activations, an often\noverlooked component, and their role in memory usage. This work employs\nstructured pruning in Block Sparse Compressed Row (BSR) format in combination\nwith a magnitude-based criterion to efficiently prune activations. We\nfurthermore introduce efficient block-sparse operators for GPUs and showcase\ntheir effectiveness, as well as the superior compression offered by block\nsparsity. We report the effectiveness of activation pruning by evaluating\ntraining speed, accuracy, and memory usage of large-scale neural architectures\non the example of ResMLP on image classification tasks. As a result, we observe\na memory reduction of up to 32% while maintaining accuracy. 
Ultimately, our\napproach aims to democratize large-scale model training, reduce GPU\nrequirements, and address ecological concerns.\n","authors":["Daniel Barley","Holger Fröning"],"pdf_url":"https://arxiv.org/pdf/2311.16883v2.pdf","comment":"8 pages, 11 figures, submitted to the 6th AccML workshop at HiPEAC\n conference 2024"},{"id":"http://arxiv.org/abs/2303.15799v3","updated":"2023-11-29T14:41:33Z","published":"2023-03-28T08:07:28Z","title":"FedAgg: Adaptive Federated Learning with Aggregated Gradients","summary":" Federated Learning (FL) has become an emerging norm for distributed model\ntraining, which enables multiple devices cooperatively to train a shared model\nutilizing their own datasets scheduled by a central server while keeping\nprivate data localized. However, during the training process, the\nnon-independent-and-identically-distributed (Non-IID) data generated on\nheterogeneous clients and frequent communication across participants may\nsignificantly influence the training performance, slow down the convergent\nrate, and increase communication consumption. In this paper, we ameliorate the\nstandard stochastic gradient descent approach by introducing the aggregated\ngradients at each local update epoch and propose an adaptive learning rate\niterative algorithm that further takes the deviation between the local\nparameter and global parameter into account. The aforementioned adaptive\nlearning rate design mechanism requires local information of all clients, which\nis challenging as there is no communication during the local update epochs. To\nobtain a decentralized adaptive learning rate for each client, we introduce the\nmean-field approach by utilizing two mean-field terms to estimate the average\nlocal parameters and gradients respectively without exchanging clients' local\ninformation with each other over time. Through theoretical analysis, we prove\nthat our method can provide the convergence guarantee for model training and\nderive a convergent upper bound for the client drifting term. Extensive\nnumerical results show that our proposed framework is superior to the\nstate-of-the-art FL schemes in both model accuracy and convergent rate on\nreal-world datasets with IID and Non-IID data distribution.\n","authors":["Wenhao Yuan","Xuehe Wang"],"pdf_url":"https://arxiv.org/pdf/2303.15799v3.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2310.03684v3","updated":"2023-11-29T14:39:37Z","published":"2023-10-05T17:01:53Z","title":"SmoothLLM: Defending Large Language Models Against Jailbreaking Attacks","summary":" Despite efforts to align large language models (LLMs) with human values,\nwidely-used LLMs such as GPT, Llama, Claude, and PaLM are susceptible to\njailbreaking attacks, wherein an adversary fools a targeted LLM into generating\nobjectionable content. To address this vulnerability, we propose SmoothLLM, the\nfirst algorithm designed to mitigate jailbreaking attacks on LLMs. Based on our\nfinding that adversarially-generated prompts are brittle to character-level\nchanges, our defense first randomly perturbs multiple copies of a given input\nprompt, and then aggregates the corresponding predictions to detect adversarial\ninputs. SmoothLLM reduces the attack success rate on numerous popular LLMs to\nbelow one percentage point, avoids unnecessary conservatism, and admits\nprovable guarantees on attack mitigation. Moreover, our defense uses\nexponentially fewer queries than existing attacks and is compatible with any\nLLM. 
Our code is publicly available at the following link:\nhttps://github.com/arobey1/smooth-llm.\n","authors":["Alexander Robey","Eric Wong","Hamed Hassani","George J. Pappas"],"pdf_url":"https://arxiv.org/pdf/2310.03684v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17673v1","updated":"2023-11-29T14:36:33Z","published":"2023-11-29T14:36:33Z","title":"Using Ornstein-Uhlenbeck Process to understand Denoising Diffusion\n Probabilistic Model and its Noise Schedules","summary":" The aim of this short note is to show that Denoising Diffusion Probabilistic\nModel DDPM, a non-homogeneous discrete-time Markov process, can be represented\nby a time-homogeneous continuous-time Markov process observed at non-uniformly\nsampled discrete times. Surprisingly, this continuous-time Markov process is\nthe well-known and well-studied Ornstein-Ohlenbeck (OU) process, which was\ndeveloped in 1930's for studying Brownian particles in Harmonic potentials. We\nestablish the formal equivalence between DDPM and the OU process using its\nanalytical solution. We further demonstrate that the design problem of the\nnoise scheduler for non-homogeneous DDPM is equivalent to designing observation\ntimes for the OU process. We present several heuristic designs for observation\ntimes based on principled quantities such as auto-variance and Fisher\nInformation and connect them to ad hoc noise schedules for DDPM. Interestingly,\nwe show that the Fisher-Information-motivated schedule corresponds exactly the\ncosine schedule, which was developed without any theoretical foundation but is\nthe current state-of-the-art noise schedule.\n","authors":["Javier E. Santos","Yen Ting Lin"],"pdf_url":"https://arxiv.org/pdf/2311.17673v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.17462v2","updated":"2023-11-29T14:33:28Z","published":"2023-10-26T15:10:10Z","title":"Towards Learning Monocular 3D Object Localization From 2D Labels using\n the Physical Laws of Motion","summary":" We present a novel method for precise 3D object localization in single images\nfrom a single calibrated camera using only 2D labels. No expensive 3D labels\nare needed. Thus, instead of using 3D labels, our model is trained with\neasy-to-annotate 2D labels along with the physical knowledge of the object's\nmotion. Given this information, the model can infer the latent third dimension,\neven though it has never seen this information during training. Our method is\nevaluated on both synthetic and real-world datasets, and we are able to achieve\na mean distance error of just 6 cm in our experiments on real data. The results\nindicate the method's potential as a step towards learning 3D object location\nestimation, where collecting 3D data for training is not feasible.\n","authors":["Daniel Kienzle","Julian Lorenz","Katja Ludwig","Rainer Lienhart"],"pdf_url":"https://arxiv.org/pdf/2310.17462v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19583v2","updated":"2023-11-29T14:33:09Z","published":"2023-10-30T14:41:53Z","title":"GC-MVSNet: Multi-View, Multi-Scale, Geometrically-Consistent Multi-View\n Stereo","summary":" Traditional multi-view stereo (MVS) methods rely heavily on photometric and\ngeometric consistency constraints, but newer machine learning-based MVS methods\ncheck geometric consistency across multiple source views only as a\npost-processing step. 
In this paper, we present a novel approach that\nexplicitly encourages geometric consistency of reference view depth maps across\nmultiple source views at different scales during learning (see Fig. 1). We find\nthat adding this geometric consistency loss significantly accelerates learning\nby explicitly penalizing geometrically inconsistent pixels, reducing the\ntraining iteration requirements to nearly half that of other MVS methods. Our\nextensive experiments show that our approach achieves a new state-of-the-art on\nthe DTU and BlendedMVS datasets, and competitive results on the Tanks and\nTemples benchmark. To the best of our knowledge, GC-MVSNet is the first attempt\nto enforce multi-view, multi-scale geometric consistency during learning.\n","authors":["Vibhas K. Vats","Sripad Joshi","David J. Crandall","Md. Alimoor Reza","Soon-heung Jung"],"pdf_url":"https://arxiv.org/pdf/2310.19583v2.pdf","comment":"Accepted in WACV 2024"},{"id":"http://arxiv.org/abs/2310.03605v3","updated":"2023-11-29T14:30:29Z","published":"2023-10-05T15:36:35Z","title":"FASER: Binary Code Similarity Search through the use of Intermediate\n Representations","summary":" Being able to identify functions of interest in cross-architecture software\nis useful whether you are analysing for malware, securing the software supply\nchain or conducting vulnerability research. Cross-Architecture Binary Code\nSimilarity Search has been explored in numerous studies and has used a wide\nrange of different data sources to achieve its goals. The data sources\ntypically used draw on common structures derived from binaries such as function\ncontrol flow graphs or binary level call graphs, the output of the disassembly\nprocess or the outputs of a dynamic analysis approach. One data source which\nhas received less attention is binary intermediate representations. Binary\nIntermediate representations possess two interesting properties: they are cross\narchitecture by their very nature and encode the semantics of a function\nexplicitly to support downstream usage. Within this paper we propose Function\nas a String Encoded Representation (FASER) which combines long document\ntransformers with the use of intermediate representations to create a model\ncapable of cross architecture function search without the need for manual\nfeature engineering, pre-training or a dynamic analysis step. We compare our\napproach against a series of baseline approaches for two tasks; A general\nfunction search task and a targeted vulnerability search task. Our approach\ndemonstrates strong performance across both tasks, performing better than all\nbaseline approaches.\n","authors":["Josh Collyer","Tim Watson","Iain Phillips"],"pdf_url":"https://arxiv.org/pdf/2310.03605v3.pdf","comment":"10 pages, Proceedings of the Conference on Applied Machine Learning\n in Information Security (CAMLIS)"},{"id":"http://arxiv.org/abs/2308.03818v2","updated":"2023-11-29T14:20:59Z","published":"2023-08-07T14:28:59Z","title":"A sparse coding approach to inverse problems with application to\n microwave tomography","summary":" Inverse imaging problems that are ill-posed can be encountered across\nmultiple domains of science and technology, ranging from medical diagnosis to\nastronomical studies. To reconstruct images from incomplete and distorted data,\nit is necessary to create algorithms that can take into account both, the\nphysical mechanisms responsible for generating these measurements and the\nintrinsic characteristics of the images being analyzed. 
In this work, the\nsparse representation of images is reviewed, which is a realistic, compact and\neffective generative model for natural images inspired by the visual system of\nmammals. It enables us to address ill-posed linear inverse problems by training\nthe model on a vast collection of images. Moreover, we extend the application\nof sparse coding to solve the non-linear and ill-posed problem in microwave\ntomography imaging, which could lead to a significant improvement of the\nstate-of-the-arts algorithms.\n","authors":["Cesar F. Caiafa","Ramiro M. Irastorza"],"pdf_url":"https://arxiv.org/pdf/2308.03818v2.pdf","comment":"submitted to RevMexAA (conference series)"},{"id":"http://arxiv.org/abs/2306.03436v2","updated":"2023-11-29T14:10:59Z","published":"2023-06-06T06:31:07Z","title":"Intellectual Property Protection of Diffusion Models via the Watermark\n Diffusion Process","summary":" Diffusion models have rapidly become a vital part of deep generative\narchitectures, given today's increasing demands. Obtaining large,\nhigh-performance diffusion models demands significant resources, highlighting\ntheir importance as intellectual property worth protecting. However, existing\nwatermarking techniques for ownership verification are insufficient when\napplied to diffusion models. Very recent research in watermarking diffusion\nmodels either exposes watermarks during task generation, which harms the\nimperceptibility, or is developed for conditional diffusion models that require\nprompts to trigger the watermark. This paper introduces WDM, a novel\nwatermarking solution for diffusion models without imprinting the watermark\nduring task generation. It involves training a model to concurrently learn a\nWatermark Diffusion Process (WDP) for embedding watermarks alongside the\nstandard diffusion process for task generation. We provide a detailed\ntheoretical analysis of WDP training and sampling, relating it to a shifted\nGaussian diffusion process via the same reverse noise. Extensive experiments\nare conducted to validate the effectiveness and robustness of our approach in\nvarious trigger and watermark data configurations.\n","authors":["Sen Peng","Yufei Chen","Cong Wang","Xiaohua Jia"],"pdf_url":"https://arxiv.org/pdf/2306.03436v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17646v1","updated":"2023-11-29T14:08:26Z","published":"2023-11-29T14:08:26Z","title":"A novel feature selection method based on quantum support vector machine","summary":" Feature selection is critical in machine learning to reduce dimensionality\nand improve model accuracy and efficiency. The exponential growth in feature\nspace dimensionality for modern datasets directly results in ambiguous samples\nand redundant features, which can severely degrade classification accuracy.\nQuantum machine learning offers potential advantages for addressing this\nchallenge. In this paper, we propose a novel method, quantum support vector\nmachine feature selection (QSVMF), integrating quantum support vector machines\nwith multi-objective genetic algorithm. QSVMF optimizes multiple simultaneous\nobjectives: maximizing classification accuracy, minimizing selected features\nand quantum circuit costs, and reducing feature covariance. We apply QSVMF for\nfeature selection on a breast cancer dataset, comparing the performance of\nQSVMF against classical approaches with the selected features. Experimental\nresults show that QSVMF achieves superior performance. 
Furthermore, The Pareto\nfront solutions of QSVMF enable analysis of accuracy versus feature set size\ntrade-offs, identifying extremely sparse yet accurate feature subsets. We\ncontextualize the biological relevance of the selected features in terms of\nknown breast cancer biomarkers. This work highlights the potential of\nquantum-based feature selection to enhance machine learning efficiency and\nperformance on complex real-world data.\n","authors":["Haiyan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17633v1","updated":"2023-11-29T13:51:04Z","published":"2023-11-29T13:51:04Z","title":"Introduction to Transformers: an NLP Perspective","summary":" Transformers have dominated empirical machine learning models of natural\nlanguage processing. In this paper, we introduce basic concepts of Transformers\nand present key techniques that form the recent advances of these models. This\nincludes a description of the standard Transformer architecture, a series of\nmodel refinements, and common applications. Given that Transformers and related\ndeep learning techniques might be evolving in ways we have never seen, we\ncannot dive into all the model details or cover all the technical areas.\nInstead, we focus on just those concepts that are helpful for gaining a good\nunderstanding of Transformers and their variants. We also summarize the key\nideas that impact this field, thereby yielding some insights into the strengths\nand limitations of these models.\n","authors":["Tong Xiao","Jingbo Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.17633v1.pdf","comment":"119 pages and 21 figures"},{"id":"http://arxiv.org/abs/2305.10361v3","updated":"2023-11-29T13:46:53Z","published":"2023-05-17T16:38:11Z","title":"Human Choice Prediction in Language-based Non-Cooperative Games:\n Simulation-based Off-Policy Evaluation","summary":" Persuasion games have been fundamental in economics and AI research, and have\nsignificant practical applications. Recent works in this area have started to\nincorporate natural language, moving beyond the traditional stylized message\nsetting. However, previous research has focused on on-policy prediction, where\nthe train and test data have the same distribution, which is not representative\nof real-life scenarios. In this paper, we tackle the challenging problem of\noff-policy evaluation (OPE) in language-based persuasion games. To address the\ninherent difficulty of human data collection in this setup, we propose a novel\napproach which combines real and simulated human-bot interaction data. Our\nsimulated data is created by an exogenous model assuming decision makers (DMs)\nstart with a mixture of random and decision-theoretic based behaviors and\nimprove over time. We present a deep learning training algorithm that\neffectively integrates real interaction and simulated data, substantially\nimproving over models that train only with interaction data. Our results\ndemonstrate the potential of real interaction and simulation mixtures as a\ncost-effective and scalable solution for OPE in language-based persuasion\ngames. 
Our code and the large dataset we collected and generated are submitted\nas supplementary material and publicly available in our GitHub repository:\nhttps://github.com/eilamshapira/HumanChoicePrediction\n","authors":["Eilam Shapira","Reut Apel","Moshe Tennenholtz","Roi Reichart"],"pdf_url":"https://arxiv.org/pdf/2305.10361v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17631v1","updated":"2023-11-29T13:45:07Z","published":"2023-11-29T13:45:07Z","title":"Q-learning Based Optimal False Data Injection Attack on Probabilistic\n Boolean Control Networks","summary":" In this paper, we present a reinforcement learning (RL) method for solving\noptimal false data injection attack problems in probabilistic Boolean control\nnetworks (PBCNs) where the attacker lacks knowledge of the system model.\nSpecifically, we employ a Q-learning (QL) algorithm to address this problem. We\nthen propose an improved QL algorithm that not only enhances learning\nefficiency but also obtains optimal attack strategies for large-scale PBCNs\nthat the standard QL algorithm cannot handle. Finally, we verify the\neffectiveness of our proposed approach by considering two attacked PBCNs,\nincluding a 10-node network and a 28-node network.\n","authors":["Xianlun Peng","Yang Tang","Fangfei Li","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.17631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16519v2","updated":"2023-11-29T13:38:17Z","published":"2023-11-28T04:58:17Z","title":"B-LSTM-MIONet: Bayesian LSTM-based Neural Operators for Learning the\n Response of Complex Dynamical Systems to Length-Variant Multiple Input\n Functions","summary":" Deep Operator Network (DeepONet) is a neural network framework for learning\nnonlinear operators such as those from ordinary differential equations (ODEs)\ndescribing complex systems. Multiple-input deep neural operators (MIONet)\nextended DeepONet to allow multiple input functions in different Banach spaces.\nMIONet offers flexibility in training dataset grid spacing, without constraints\non output location. However, it requires offline inputs and cannot handle\nvarying sequence lengths in testing datasets, limiting its real-time\napplication in dynamic complex systems. This work redesigns MIONet, integrating\nLong Short Term Memory (LSTM) to learn neural operators from time-dependent\ndata. This approach overcomes data discretization constraints and harnesses\nLSTM's capability with variable-length, real-time data. Factors affecting\nlearning performance, like algorithm extrapolation ability are presented. The\nframework is enhanced with uncertainty quantification through a novel Bayesian\nmethod, sampling from MIONet parameter distributions. Consequently, we develop\nthe B-LSTM-MIONet, incorporating LSTM's temporal strengths with Bayesian\nrobustness, resulting in a more precise and reliable model for noisy datasets.\n","authors":["Zhihao Kong","Amirhossein Mollaali","Christian Moya","Na Lu","Guang Lin"],"pdf_url":"https://arxiv.org/pdf/2311.16519v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16834v2","updated":"2023-11-29T13:23:42Z","published":"2023-11-28T14:51:06Z","title":"Modular Neural Networks for Time Series Forecasting: Interpretability\n and Feature Selection using Attention","summary":" Multivariate time series have many applications, from healthcare and\nmeteorology to life science. Although deep learning models have shown excellent\npredictive performance for time series, they have been criticised for being\n\"black-boxes\" or non-interpretable. 
This paper proposes a novel modular neural\nnetwork model for multivariate time series prediction that is interpretable by\nconstruction. A recurrent neural network learns the temporal dependencies in\nthe data while an attention-based feature selection component selects the most\nrelevant features and suppresses redundant features used in the learning of the\ntemporal dependencies. A modular deep network is trained from the selected\nfeatures independently to show the users how features influence outcomes,\nmaking the model interpretable. Experimental results show that this approach\ncan outperform state-of-the-art interpretable Neural Additive Models (NAM) and\nvariations thereof in both regression and classification of time series tasks,\nachieving a predictive performance that is comparable to the top\nnon-interpretable methods for time series, LSTM and XGBoost.\n","authors":["Qiqi Su","Christos Kloukinas","Artur d'Avila Garcez"],"pdf_url":"https://arxiv.org/pdf/2311.16834v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03579v2","updated":"2023-11-29T13:21:52Z","published":"2023-09-07T09:18:12Z","title":"DTW+S: Shape-based Comparison of Time-series with Ordered Local Trend","summary":" Measuring distance or similarity between time-series data is a fundamental\naspect of many applications including classification, clustering, and\nensembling/alignment. Existing measures may fail to capture similarities among\nlocal trends (shapes) and may even produce misleading results. Our goal is to\ndevelop a measure that looks for similar trends occurring around similar times\nand is easily interpretable for researchers in applied domains. This is\nparticularly useful for applications where time-series have a sequence of\nmeaningful local trends that are ordered, such as in epidemics (a surge to an\nincrease to a peak to a decrease). We propose a novel measure, DTW+S, which\ncreates an interpretable \"closeness-preserving\" matrix representation of the\ntime-series, where each column represents local trends, and then it applies\nDynamic Time Warping to compute distances between these matrices. We present a\ntheoretical analysis that supports the choice of this representation. We\ndemonstrate the utility of DTW+S in several tasks. For the clustering of\nepidemic curves, we show that DTW+S is the only measure able to produce good\nclustering compared to the baselines. For ensemble building, we propose a\ncombination of DTW+S and barycenter averaging that results in the best\npreservation of characteristics of the underlying trajectories. We also\ndemonstrate that our approach results in better classification compared to\nDynamic Time Warping for a class of datasets, particularly when local trends\nrather than scale play a decisive role.\n","authors":["Ajitesh Srivastava"],"pdf_url":"https://arxiv.org/pdf/2309.03579v2.pdf","comment":"11 pages, 11 figures Update: Included barycenter averaging with DTW+S\n along with results"},{"id":"http://arxiv.org/abs/2311.08569v2","updated":"2023-11-29T13:20:53Z","published":"2023-11-14T22:14:07Z","title":"Uncertainty Quantification in Neural-Network Based Pain Intensity\n Estimation","summary":" Improper pain management can lead to severe physical or mental consequences,\nincluding suffering, and an increased risk of opioid dependency. Assessing the\npresence and severity of pain is imperative to prevent such outcomes and\ndetermine the appropriate intervention. 
However, the evaluation of pain\nintensity is challenging because different individuals experience pain\ndifferently. To overcome this, researchers have employed machine learning\nmodels to evaluate pain intensity objectively. However, these efforts have\nprimarily focused on point estimation of pain, disregarding the inherent\nuncertainty and variability present in the data and model. Consequently, the\npoint estimates provide only partial information for clinical decision-making.\nThis study presents a neural network-based method for objective pain interval\nestimation, incorporating uncertainty quantification. This work explores three\nalgorithms: the bootstrap method, lower and upper bound estimation (LossL)\noptimized by genetic algorithm, and modified lower and upper bound estimation\n(LossS) optimized by gradient descent algorithm. Our empirical results reveal\nthat LossS outperforms the other two by providing a narrower prediction\ninterval. As LossS outperforms, we assessed its performance in three different\nscenarios for pain assessment: (1) a generalized approach (single model for the\nentire population), (2) a personalized approach (separate model for each\nindividual), and (3) a hybrid approach (separate model for each cluster of\nindividuals). Our findings demonstrate the hybrid approach's superior\nperformance, with notable practicality in clinical contexts. It has the\npotential to be a valuable tool for clinicians, enabling objective pain\nintensity assessment while taking uncertainty into account. This capability is\ncrucial in facilitating effective pain management and reducing the risks\nassociated with improper treatment.\n","authors":["Burcu Ozek","Zhenyuan Lu","Srinivasan Radhakrishnan","Sagar Kamarthi"],"pdf_url":"https://arxiv.org/pdf/2311.08569v2.pdf","comment":"26 pages, 5 figures, 9 tables"},{"id":"http://arxiv.org/abs/2311.17609v1","updated":"2023-11-29T13:06:48Z","published":"2023-11-29T13:06:48Z","title":"AnyLens: A Generative Diffusion Model with Any Rendering Lens","summary":" State-of-the-art diffusion models can generate highly realistic images based\non various conditioning like text, segmentation, and depth. However, an\nessential aspect often overlooked is the specific camera geometry used during\nimage capture. The influence of different optical systems on the final scene\nappearance is frequently overlooked. This study introduces a framework that\nintimately integrates a text-to-image diffusion model with the particular lens\ngeometry used in image rendering. Our method is based on a per-pixel coordinate\nconditioning method, enabling the control over the rendering geometry. Notably,\nwe demonstrate the manipulation of curvature properties, achieving diverse\nvisual effects, such as fish-eye, panoramic views, and spherical texturing\nusing a single diffusion model.\n","authors":["Andrey Voynov","Amir Hertz","Moab Arar","Shlomi Fruchter","Daniel Cohen-Or"],"pdf_url":"https://arxiv.org/pdf/2311.17609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17608v1","updated":"2023-11-29T13:05:20Z","published":"2023-11-29T13:05:20Z","title":"Adversarial Robust Memory-Based Continual Learner","summary":" Despite the remarkable advances that have been made in continual learning,\nthe adversarial vulnerability of such methods has not been fully discussed. We\ndelve into the adversarial robustness of memory-based continual learning\nalgorithms and observe limited robustness improvement by directly applying\nadversarial training techniques. 
Preliminary studies reveal the twin challenges\nfor building adversarial robust continual learners: accelerated forgetting in\ncontinual learning and gradient obfuscation in adversarial robustness. In this\nstudy, we put forward a novel adversarial robust memory-based continual learner\nthat adjusts data logits to mitigate the forgetting of pasts caused by\nadversarial samples. Furthermore, we devise a gradient-based data selection\nmechanism to overcome the gradient obfuscation caused by limited stored data.\nThe proposed approach can widely integrate with existing memory-based continual\nlearning as well as adversarial training algorithms in a plug-and-play way.\nExtensive experiments on Split-CIFAR10/100 and Split-Tiny-ImageNet demonstrate\nthe effectiveness of our approach, achieving up to 8.13% higher accuracy for\nadversarial data.\n","authors":["Xiaoyue Mi","Fan Tang","Zonghan Yang","Danding Wang","Juan Cao","Peng Li","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.17608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17607v1","updated":"2023-11-29T13:05:06Z","published":"2023-11-29T13:05:06Z","title":"Topology-Preserving Adversarial Training","summary":" Despite the effectiveness in improving the robustness of neural networks,\nadversarial training has suffered from the natural accuracy degradation\nproblem, i.e., accuracy on natural samples has reduced significantly. In this\nstudy, we reveal that natural accuracy degradation is highly related to the\ndisruption of the natural sample topology in the representation space by\nquantitative and qualitative experiments. Based on this observation, we propose\nTopology-pReserving Adversarial traINing (TRAIN) to alleviate the problem by\npreserving the topology structure of natural samples from a standard model\ntrained only on natural samples during adversarial training. As an additional\nregularization, our method can easily be combined with various popular\nadversarial training algorithms in a plug-and-play manner, taking advantage of\nboth sides. Extensive experiments on CIFAR-10, CIFAR-100, and Tiny ImageNet\nshow that our proposed method achieves consistent and significant improvements\nover various strong baselines in most cases. Specifically, without additional\ndata, our proposed method achieves up to 8.78% improvement in natural accuracy\nand 4.50% improvement in robust accuracy.\n","authors":["Xiaoyue Mi","Fan Tang","Yepeng Weng","Danding Wang","Juan Cao","Sheng Tang","Peng Li","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.17607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17601v1","updated":"2023-11-29T12:53:32Z","published":"2023-11-29T12:53:32Z","title":"Continual Learning with Low Rank Adaptation","summary":" Recent work using pretrained transformers has shown impressive performance\nwhen fine-tuned with data from the downstream problem of interest. However,\nthey struggle to retain that performance when the data characteristics changes.\nIn this paper, we focus on continual learning, where a pre-trained transformer\nis updated to perform well on new data, while retaining its performance on data\nit was previously trained on. Earlier works have tackled this primarily through\nmethods inspired from prompt tuning. We question this choice, and investigate\nthe applicability of Low Rank Adaptation (LoRA) to continual learning. 
On a\nrange of domain-incremental learning benchmarks, our LoRA-based solution,\nCoLoR, yields state-of-the-art performance, while still being as parameter\nefficient as the prompt tuning based methods.\n","authors":["Martin Wistuba","Prabhu Teja Sivaprasad","Lukas Balles","Giovanni Zappella"],"pdf_url":"https://arxiv.org/pdf/2311.17601v1.pdf","comment":"Accepted at Workshop on Distribution Shifts (DistShift), NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.17598v1","updated":"2023-11-29T12:48:33Z","published":"2023-11-29T12:48:33Z","title":"Improving embedding of graphs with missing data by soft manifolds","summary":" Embedding graphs in continous spaces is a key factor in designing and\ndeveloping algorithms for automatic information extraction to be applied in\ndiverse tasks (e.g., learning, inferring, predicting). The reliability of graph\nembeddings directly depends on how much the geometry of the continuous space\nmatches the graph structure. Manifolds are mathematical structure that can\nenable to incorporate in their topological spaces the graph characteristics,\nand in particular nodes distances. State-of-the-art of manifold-based graph\nembedding algorithms take advantage of the assumption that the projection on a\ntangential space of each point in the manifold (corresponding to a node in the\ngraph) would locally resemble a Euclidean space. Although this condition helps\nin achieving efficient analytical solutions to the embedding problem, it does\nnot represent an adequate set-up to work with modern real life graphs, that are\ncharacterized by weighted connections across nodes often computed over sparse\ndatasets with missing records. In this work, we introduce a new class of\nmanifold, named soft manifold, that can solve this situation. In particular,\nsoft manifolds are mathematical structures with spherical symmetry where the\ntangent spaces to each point are hypocycloids whose shape is defined according\nto the velocity of information propagation across the data points. Using soft\nmanifolds for graph embedding, we can provide continuous spaces to pursue any\ntask in data analysis over complex datasets. Experimental results on\nreconstruction tasks on synthetic and real datasets show how the proposed\napproach enable more accurate and reliable characterization of graphs in\ncontinuous spaces with respect to the state-of-the-art.\n","authors":["Andrea Marinoni","Pietro Lio'","Alessandro Barp","Christian Jutten","Mark Girolami"],"pdf_url":"https://arxiv.org/pdf/2311.17598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17593v1","updated":"2023-11-29T12:41:55Z","published":"2023-11-29T12:41:55Z","title":"LanGWM: Language Grounded World Model","summary":" Recent advances in deep reinforcement learning have showcased its potential\nin tackling complex tasks. However, experiments on visual control tasks have\nrevealed that state-of-the-art reinforcement learning models struggle with\nout-of-distribution generalization. Conversely, expressing higher-level\nconcepts and global contexts is relatively easy using language.\n Building upon recent success of the large language models, our main objective\nis to improve the state abstraction technique in reinforcement learning by\nleveraging language for robust action selection. 
Specifically, we focus on\nlearning language-grounded visual features to enhance the world model learning,\na model-based reinforcement learning technique.\n To enforce our hypothesis explicitly, we mask out the bounding boxes of a few\nobjects in the image observation and provide the text prompt as descriptions\nfor these masked objects. Subsequently, we predict the masked objects along\nwith the surrounding regions as pixel reconstruction, similar to the\ntransformer-based masked autoencoder approach.\n Our proposed LanGWM: Language Grounded World Model achieves state-of-the-art\nperformance in out-of-distribution test at the 100K interaction steps\nbenchmarks of iGibson point navigation tasks. Furthermore, our proposed\ntechnique of explicit language-grounded visual representation learning has the\npotential to improve models for human-robot interaction because our extracted\nvisual features are language grounded.\n","authors":["Rudra P. K. Poudel","Harit Pandya","Chao Zhang","Roberto Cipolla"],"pdf_url":"https://arxiv.org/pdf/2311.17593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17586v1","updated":"2023-11-29T12:29:54Z","published":"2023-11-29T12:29:54Z","title":"Federated Online and Bandit Convex Optimization","summary":" We study the problems of distributed online and bandit convex optimization\nagainst an adaptive adversary. We aim to minimize the average regret on $M$\nmachines working in parallel over $T$ rounds with $R$ intermittent\ncommunications. Assuming the underlying cost functions are convex and can be\ngenerated adaptively, our results show that collaboration is not beneficial\nwhen the machines have access to the first-order gradient information at the\nqueried points. This is in contrast to the case for stochastic functions, where\neach machine samples the cost functions from a fixed distribution. Furthermore,\nwe delve into the more challenging setting of federated online optimization\nwith bandit (zeroth-order) feedback, where the machines can only access values\nof the cost functions at the queried points. The key finding here is\nidentifying the high-dimensional regime where collaboration is beneficial and\nmay even lead to a linear speedup in the number of machines. We further\nillustrate our findings through federated adversarial linear bandits by\ndeveloping novel distributed single and two-point feedback algorithms. 
Our work\nis the first attempt towards a systematic understanding of federated online\noptimization with limited feedback, and it attains tight regret bounds in the\nintermittent communication setting for both first and zeroth-order feedback.\nOur results thus bridge the gap between stochastic and adaptive settings in\nfederated online optimization.\n","authors":["Kumar Kshitij Patel","Lingxiao Wang","Aadirupa Saha","Nati Sebro"],"pdf_url":"https://arxiv.org/pdf/2311.17586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17582v1","updated":"2023-11-29T12:18:46Z","published":"2023-11-29T12:18:46Z","title":"LoCoMotif: Discovering time-warped motifs in time series","summary":" Time Series Motif Discovery (TSMD) refers to the task of identifying patterns\nthat occur multiple times (possibly with minor variations) in a time series.\nAll existing methods for TSMD have one or more of the following limitations:\nthey only look for the two most similar occurrences of a pattern; they only\nlook for patterns of a pre-specified, fixed length; they cannot handle\nvariability along the time axis; and they only handle univariate time series.\nIn this paper, we present a new method, LoCoMotif, that has none of these\nlimitations. The method is motivated by a concrete use case from physiotherapy.\nWe demonstrate the value of the proposed method on this use case. We also\nintroduce a new quantitative evaluation metric for motif discovery, and\nbenchmark data for comparing TSMD methods. LoCoMotif substantially outperforms\nthe existing methods, on top of being more broadly applicable.\n","authors":["Daan Van Wesenbeeck","Aras Yurtman","Wannes Meert","Hendrik Blockeel"],"pdf_url":"https://arxiv.org/pdf/2311.17582v1.pdf","comment":"26 pages, 15 figures. Submitted to the journal track of the European\n Conference on Machine Learning and Principles and Practice of Knowledge\n Discovery in Databases (ECMLPKDD) 2024 in partnership with the Data Mining\n and Knowledge Discovery journal. Source code of the method is available at\n http://github.com/ML-KULeuven/locomotif"},{"id":"http://arxiv.org/abs/2310.00965v2","updated":"2023-11-29T12:13:35Z","published":"2023-10-02T08:12:51Z","title":"Effective Learning with Node Perturbation in Deep Neural Networks","summary":" Backpropagation (BP) is the dominant and most successful method for training\nparameters of deep neural network models. However, BP relies on two\ncomputationally distinct phases, does not provide a satisfactory explanation of\nbiological learning, and can be challenging to apply for training of networks\nwith discontinuities or noisy node dynamics. By comparison, node perturbation\n(NP) proposes learning by the injection of noise into the network activations,\nand subsequent measurement of the induced loss change. NP relies on two forward\n(inference) passes, does not make use of network derivatives, and has been\nproposed as a model for learning in biological systems. However, standard NP is\nhighly data inefficient and unstable due to its unguided noise-based search\nprocess. In this work, we investigate different formulations of NP and relate\nit to the concept of directional derivatives as well as combining it with a\ndecorrelating mechanism for layer-wise inputs. 
We find that a closer alignment\nwith directional derivatives together with input decorrelation at every layer\nsignificantly enhances performance of NP learning, making its performance on\nthe train set competitive with BP and allowing its application to noisy systems\nin which the noise process itself is inaccessible.\n","authors":["Sander Dalm","Marcel van Gerven","Nasir Ahmad"],"pdf_url":"https://arxiv.org/pdf/2310.00965v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17565v1","updated":"2023-11-29T11:59:03Z","published":"2023-11-29T11:59:03Z","title":"Bias Resilient Multi-Step Off-Policy Goal-Conditioned Reinforcement\n Learning","summary":" In goal-conditioned reinforcement learning (GCRL), sparse rewards present\nsignificant challenges, often obstructing efficient learning. Although\nmulti-step GCRL can boost this efficiency, it can also lead to off-policy\nbiases in target values. This paper dives deep into these biases, categorizing\nthem into two distinct categories: \"shooting\" and \"shifting\". Recognizing that\ncertain behavior policies can hasten policy refinement, we present solutions\ndesigned to capitalize on the positive aspects of these biases while minimizing\ntheir drawbacks, enabling the use of larger step sizes to speed up GCRL. An\nempirical study demonstrates that our approach ensures a resilient and robust\nimprovement, even in ten-step learning scenarios, leading to superior learning\nefficiency and performance that generally surpass the baseline and several\nstate-of-the-art multi-step GCRL benchmarks.\n","authors":["Lisheng Wu","Ke Chen"],"pdf_url":"https://arxiv.org/pdf/2311.17565v1.pdf","comment":"26 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.17560v1","updated":"2023-11-29T11:48:16Z","published":"2023-11-29T11:48:16Z","title":"Interpreting Differentiable Latent States for Healthcare Time-series\n Data","summary":" Machine learning enables extracting clinical insights from large temporal\ndatasets. The applications of such machine learning models include identifying\ndisease patterns and predicting patient outcomes. However, limited\ninterpretability poses challenges for deploying advanced machine learning in\ndigital healthcare. Understanding the meaning of latent states is crucial for\ninterpreting machine learning models, assuming they capture underlying\npatterns. In this paper, we present a concise algorithm that allows for i)\ninterpreting latent states using highly related input features; ii)\ninterpreting predictions using subsets of input features via latent states; and\niii) interpreting changes in latent states over time. The proposed algorithm is\nfeasible for any model that is differentiable. We demonstrate that this\napproach enables the identification of a daytime behavioral pattern for\npredicting nocturnal behavior in a real-world healthcare dataset.\n","authors":["Yu Chen","Nivedita Bijlani","Samaneh Kouchaki","Payam Barnaghi"],"pdf_url":"https://arxiv.org/pdf/2311.17560v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17552v1","updated":"2023-11-29T11:35:54Z","published":"2023-11-29T11:35:54Z","title":"An Efficient Illumination Invariant Tiger Detection Framework for\n Wildlife Surveillance","summary":" Tiger conservation necessitates the strategic deployment of multifaceted\ninitiatives encompassing the preservation of ecological habitats, anti-poaching\nmeasures, and community involvement for sustainable growth in the tiger\npopulation. 
With the advent of artificial intelligence, tiger surveillance can\nbe automated using object detection. In this paper, an accurate illumination\ninvariant framework is proposed based on EnlightenGAN and YOLOv8 for tiger\ndetection. The fine-tuned YOLOv8 model achieves a mAP score of 61% without\nillumination enhancement. The illumination enhancement improves the mAP by\n0.7%. The approaches elevate the state-of-the-art performance on the ATRW\ndataset by approximately 6% to 7%.\n","authors":["Gaurav Pendharkar","A. Ancy Micheal","Jason Misquitta","Ranjeesh Kaippada"],"pdf_url":"https://arxiv.org/pdf/2311.17552v1.pdf","comment":"accepted at ICCIS 2023"},{"id":"http://arxiv.org/abs/2311.17539v1","updated":"2023-11-29T11:19:50Z","published":"2023-11-29T11:19:50Z","title":"The Effects of Overparameterization on Sharpness-aware Minimization: An\n Empirical and Theoretical Analysis","summary":" Training an overparameterized neural network can yield minimizers of the same\nlevel of training loss and yet different generalization capabilities. With\nevidence that indicates a correlation between sharpness of minima and their\ngeneralization errors, increasing efforts have been made to develop an\noptimization method to explicitly find flat minima as more generalizable\nsolutions. This sharpness-aware minimization (SAM) strategy, however, has not\nbeen studied much yet as to how overparameterization can actually affect its\nbehavior. In this work, we analyze SAM under varying degrees of\noverparameterization and present both empirical and theoretical results that\nsuggest a critical influence of overparameterization on SAM. Specifically, we\nfirst use standard techniques in optimization to prove that SAM can achieve a\nlinear convergence rate under overparameterization in a stochastic setting. We\nalso show that the linearly stable minima found by SAM are indeed flatter and\nhave more uniformly distributed Hessian moments compared to those of SGD. These\nresults are corroborated with our experiments that reveal a consistent trend\nthat the generalization improvement made by SAM continues to increase as the\nmodel becomes more overparameterized. We further present that sparsity can open\nup an avenue for effective overparameterization in practice.\n","authors":["Sungbin Shin","Dongyeop Lee","Maksym Andriushchenko","Namhoon Lee"],"pdf_url":"https://arxiv.org/pdf/2311.17539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18381v3","updated":"2023-11-29T10:46:19Z","published":"2023-05-28T06:53:41Z","title":"Distill Gold from Massive Ores: Efficient Dataset Distillation via\n Critical Samples Selection","summary":" Data-efficient learning has garnered significant attention, especially given\nthe current trend of large multi-modal models. Recently, dataset distillation\nbecomes an effective approach for data-efficiency; however, the distillation\nprocess itself can still be inefficient. In this work, we model the dataset\ndistillation task within the context of information transport. By observing the\nsubstantial data redundancy inherent in the distillation, we argue to put more\nemphasis on the samples' utility for the distillation task. We introduce and\nvalidate a family of data utility estimators and optimal data selection methods\nto exploit the most valuable samples. 
This strategy significantly reduces the\ntraining costs and extends various existing distillation algorithms to larger\nand more diversified datasets, e.g., in some cases only 0.04% training data is\nsufficient for comparable distillation performance. Our method consistently\nenhances the distillation algorithms, even on much larger-scale and more\nheterogeneous datasets, e.g. ImageNet-1K and Kinetics-400. This paradigm opens\nup new avenues in the dynamics of distillation and paves the way for efficient\ndataset distillation. Our code is available on\nhttps://github.com/silicx/GoldFromOres .\n","authors":["Yue Xu","Yong-Lu Li","Kaitong Cui","Ziyu Wang","Cewu Lu","Yu-Wing Tai","Chi-Keung Tang"],"pdf_url":"https://arxiv.org/pdf/2305.18381v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10642v3","updated":"2023-11-29T10:41:36Z","published":"2023-11-17T16:58:52Z","title":"Rethinking Attention: Exploring Shallow Feed-Forward Neural Networks as\n an Alternative to Attention Layers in Transformers","summary":" This work presents an analysis of the effectiveness of using standard shallow\nfeed-forward networks to mimic the behavior of the attention mechanism in the\noriginal Transformer model, a state-of-the-art architecture for\nsequence-to-sequence tasks. We substitute key elements of the attention\nmechanism in the Transformer with simple feed-forward networks, trained using\nthe original components via knowledge distillation. Our experiments, conducted\non the IWSLT2017 dataset, reveal the capacity of these \"attentionless\nTransformers\" to rival the performance of the original architecture. Through\nrigorous ablation studies, and experimenting with various replacement network\ntypes and sizes, we offer insights that support the viability of our approach.\nThis not only sheds light on the adaptability of shallow feed-forward networks\nin emulating attention mechanisms but also underscores their potential to\nstreamline complex architectures for sequence-to-sequence tasks.\n","authors":["Vukasin Bozic","Danilo Dordevic","Daniele Coppola","Joseph Thommes","Sidak Pal Singh"],"pdf_url":"https://arxiv.org/pdf/2311.10642v3.pdf","comment":"Accepted at AAAI24(https://aaai.org/aaai-conference/)"},{"id":"http://arxiv.org/abs/2311.17518v1","updated":"2023-11-29T10:40:52Z","published":"2023-11-29T10:40:52Z","title":"The devil is in the fine-grained details: Evaluating open-vocabulary\n object detectors for fine-grained understanding","summary":" Recent advancements in large vision-language models enabled visual object\ndetection in open-vocabulary scenarios, where object classes are defined in\nfree-text formats during inference. In this paper, we aim to probe the\nstate-of-the-art methods for open-vocabulary object detection to determine to\nwhat extent they understand fine-grained properties of objects and their parts.\nTo this end, we introduce an evaluation protocol based on dynamic vocabulary\ngeneration to test whether models detect, discern, and assign the correct\nfine-grained description to objects in the presence of hard-negative classes.\nWe contribute with a benchmark suite of increasing difficulty and probing\ndifferent properties like color, pattern, and material. We further enhance our\ninvestigation by evaluating several state-of-the-art open-vocabulary object\ndetectors using the proposed protocol and find that most existing solutions,\nwhich shine in standard open-vocabulary benchmarks, struggle to accurately\ncapture and distinguish finer object details. 
We conclude the paper by\nhighlighting the limitations of current methodologies and exploring promising\nresearch directions to overcome the discovered drawbacks. Data and code are\navailable at https://github.com/lorebianchi98/FG-OVD.\n","authors":["Lorenzo Bianchi","Fabio Carrara","Nicola Messina","Claudio Gennaro","Fabrizio Falchi"],"pdf_url":"https://arxiv.org/pdf/2311.17518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14912v3","updated":"2023-11-29T10:34:36Z","published":"2023-05-24T09:02:01Z","title":"SVDinsTN: A Tensor Network Paradigm for Efficient Structure Search from\n Regularized Modeling Perspective","summary":" Tensor network (TN) representation is a powerful technique for computer\nvision and machine learning. TN structure search (TN-SS) aims to search for a\ncustomized structure to achieve a compact representation, which is a\nchallenging NP-hard problem. Recent \"sampling-evaluation-based\" methods require\nsampling an extensive collection of structures and evaluating them one by one,\nresulting in prohibitively high computational costs. To address this issue, we\npropose a novel TN paradigm, named SVD-inspired TN decomposition (SVDinsTN),\nwhich allows us to efficiently solve the TN-SS problem from a regularized\nmodeling perspective, eliminating the repeated structure evaluations. To be\nspecific, by inserting a diagonal factor for each edge of the fully-connected\nTN, SVDinsTN allows us to calculate TN cores and diagonal factors\nsimultaneously, with the factor sparsity revealing a compact TN structure. In\ntheory, we prove a convergence guarantee for the proposed method. Experimental\nresults demonstrate that the proposed method achieves approximately 100 to 1000\ntimes acceleration compared to the state-of-the-art TN-SS methods while\nmaintaining a comparable representation ability.\n","authors":["Yu-Bang Zheng","Xi-Le Zhao","Junhua Zeng","Chao Li","Qibin Zhao","Heng-Chao Li","Ting-Zhu Huang"],"pdf_url":"https://arxiv.org/pdf/2305.14912v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17508v1","updated":"2023-11-29T10:32:40Z","published":"2023-11-29T10:32:40Z","title":"Model Performance Prediction for Hyperparameter Optimization of Deep\n Learning Models Using High Performance Computing and Quantum Annealing","summary":" Hyperparameter Optimization (HPO) of Deep Learning-based models tends to be a\ncompute resource intensive process as it usually requires to train the target\nmodel with many different hyperparameter configurations. We show that\nintegrating model performance prediction with early stopping methods holds\ngreat potential to speed up the HPO process of deep learning models. Moreover,\nwe propose a novel algorithm called Swift-Hyperband that can use either\nclassical or quantum support vector regression for performance prediction and\nbenefit from distributed High Performance Computing environments. This\nalgorithm is tested not only for the Machine-Learned Particle Flow model used\nin High Energy Physics, but also for a wider range of target models from\ndomains such as computer vision and natural language processing.\nSwift-Hyperband is shown to find comparable (or better) hyperparameters as well\nas using less computational resources in all test cases.\n","authors":["Juan Pablo García Amboage","Eric Wulff","Maria Girone","Tomás F. 
Pena"],"pdf_url":"https://arxiv.org/pdf/2311.17508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14545v2","updated":"2023-11-29T10:20:19Z","published":"2023-02-28T13:10:04Z","title":"Modern Bayesian Experimental Design","summary":" Bayesian experimental design (BED) provides a powerful and general framework\nfor optimizing the design of experiments. However, its deployment often poses\nsubstantial computational challenges that can undermine its practical use. In\nthis review, we outline how recent advances have transformed our ability to\novercome these challenges and thus utilize BED effectively, before discussing\nsome key areas for future development in the field.\n","authors":["Tom Rainforth","Adam Foster","Desi R Ivanova","Freddie Bickford Smith"],"pdf_url":"https://arxiv.org/pdf/2302.14545v2.pdf","comment":"Accepted for publication in Statistical Science"},{"id":"http://arxiv.org/abs/2203.00948v4","updated":"2023-11-29T10:17:09Z","published":"2022-03-02T08:58:06Z","title":"CD-GAN: a robust fusion-based generative adversarial network for\n unsupervised remote sensing change detection with heterogeneous sensors","summary":" In the context of Earth observation, change detection boils down to comparing\nimages acquired at different times by sensors of possibly different spatial\nand/or spectral resolutions or different modalities (e.g., optical or radar).\nEven when considering only optical images, this task has proven to be\nchallenging as soon as the sensors differ by their spatial and/or spectral\nresolutions. This paper proposes a novel unsupervised change detection method\ndedicated to images acquired by such so-called heterogeneous optical sensors.\nIt capitalizes on recent advances which formulate the change detection task\ninto a robust fusion framework. Adopting this formulation, the work reported in\nthis paper shows that any off-the-shelf network trained beforehand to fuse\noptical images of different spatial and/or spectral resolutions can be easily\ncomplemented with a network of the same architecture and embedded into an\nadversarial framework to perform change detection. A comparison with\nstate-of-the-art change detection methods demonstrates the versatility and the\neffectiveness of the proposed approach.\n","authors":["Jin-Ju Wang","Nicolas Dobigeon","Marie Chabert","Ding-Cheng Wang","Ting-Zhu Huang","Jie Huang"],"pdf_url":"https://arxiv.org/pdf/2203.00948v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.05368v3","updated":"2023-11-29T10:11:23Z","published":"2022-11-10T06:22:12Z","title":"A Comprehensive Survey on Distributed Training of Graph Neural Networks","summary":" Graph neural networks (GNNs) have been demonstrated to be a powerful\nalgorithmic model in broad application fields for their effectiveness in\nlearning over graphs. To scale GNN training up for large-scale and ever-growing\ngraphs, the most promising solution is distributed training which distributes\nthe workload of training across multiple computing nodes. At present, the\nvolume of related research on distributed GNN training is exceptionally vast,\naccompanied by an extraordinarily rapid pace of publication. Moreover, the\napproaches reported in these studies exhibit significant divergence. This\nsituation poses a considerable challenge for newcomers, hindering their ability\nto grasp a comprehensive understanding of the workflows, computational\npatterns, communication strategies, and optimization techniques employed in\ndistributed GNN training. 
As a result, there is a pressing need for a survey to\nprovide correct recognition, analysis, and comparisons in this field. In this\npaper, we provide a comprehensive survey of distributed GNN training by\ninvestigating various optimization techniques used in distributed GNN training.\nFirst, distributed GNN training is classified into several categories according\nto their workflows. In addition, their computational patterns and communication\npatterns, as well as the optimization techniques proposed by recent work are\nintroduced. Second, the software frameworks and hardware platforms of\ndistributed GNN training are also introduced for a deeper understanding. Third,\ndistributed GNN training is compared with distributed training of deep neural\nnetworks, emphasizing the uniqueness of distributed GNN training. Finally,\ninteresting issues and opportunities in this field are discussed.\n","authors":["Haiyang Lin","Mingyu Yan","Xiaochun Ye","Dongrui Fan","Shirui Pan","Wenguang Chen","Yuan Xie"],"pdf_url":"https://arxiv.org/pdf/2211.05368v3.pdf","comment":"To Appear in Proceedings of the IEEE"},{"id":"http://arxiv.org/abs/2301.00545v4","updated":"2023-11-29T10:10:04Z","published":"2023-01-02T07:13:28Z","title":"Knockoffs-SPR: Clean Sample Selection in Learning with Noisy Labels","summary":" A noisy training set usually leads to the degradation of the generalization\nand robustness of neural networks. In this paper, we propose a novel\ntheoretically guaranteed clean sample selection framework for learning with\nnoisy labels. Specifically, we first present a Scalable Penalized Regression\n(SPR) method, to model the linear relation between network features and one-hot\nlabels. In SPR, the clean data are identified by the zero mean-shift parameters\nsolved in the regression model. We theoretically show that SPR can recover\nclean data under some conditions. Under general scenarios, the conditions may\nbe no longer satisfied; and some noisy data are falsely selected as clean data.\nTo solve this problem, we propose a data-adaptive method for Scalable Penalized\nRegression with Knockoff filters (Knockoffs-SPR), which is provable to control\nthe False-Selection-Rate (FSR) in the selected clean data. To improve the\nefficiency, we further present a split algorithm that divides the whole\ntraining set into small pieces that can be solved in parallel to make the\nframework scalable to large datasets. While Knockoffs-SPR can be regarded as a\nsample selection module for a standard supervised training pipeline, we further\ncombine it with a semi-supervised algorithm to exploit the support of noisy\ndata as unlabeled data. Experimental results on several benchmark datasets and\nreal-world noisy datasets show the effectiveness of our framework and validate\nthe theoretical results of Knockoffs-SPR. Our code and pre-trained models are\navailable at https://github.com/Yikai-Wang/Knockoffs-SPR.\n","authors":["Yikai Wang","Yanwei Fu","Xinwei Sun"],"pdf_url":"https://arxiv.org/pdf/2301.00545v4.pdf","comment":"update: final version, to appear in TPAMI"},{"id":"http://arxiv.org/abs/2110.12403v3","updated":"2023-11-29T10:01:05Z","published":"2021-10-24T10:23:51Z","title":"Learning to Estimate Without Bias","summary":" The Gauss Markov theorem states that the weighted least squares estimator is\na linear minimum variance unbiased estimation (MVUE) in linear models. In this\npaper, we take a first step towards extending this result to non linear\nsettings via deep learning with bias constraints. 
The classical approach to\ndesigning non-linear MVUEs is through maximum likelihood estimation (MLE) which\noften involves computationally challenging optimizations. On the other hand,\ndeep learning methods allow for non-linear estimators with fixed computational\ncomplexity. Learning based estimators perform optimally on average with respect\nto their training set but may suffer from significant bias in other parameters.\nTo avoid this, we propose to add a simple bias constraint to the loss function,\nresulting in an estimator we refer to as Bias Constrained Estimator (BCE). We\nprove that this yields asymptotic MVUEs that behave similarly to the classical\nMLEs and asymptotically attain the Cramer Rao bound. We demonstrate the\nadvantages of our approach in the context of signal to noise ratio estimation\nas well as covariance estimation. A second motivation to BCE is in applications\nwhere multiple estimates of the same unknown are averaged for improved\nperformance. Examples include distributed sensor networks and data augmentation\nin test-time. In such applications, we show that BCE leads to asymptotically\nconsistent estimators.\n","authors":["Tzvi Diskin","Yonina C. Eldar","Ami Wiesel"],"pdf_url":"https://arxiv.org/pdf/2110.12403v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08972v2","updated":"2023-11-29T09:57:06Z","published":"2023-11-15T14:04:37Z","title":"Unsupervised approaches based on optimal transport and convex analysis\n for inverse problems in imaging","summary":" Unsupervised deep learning approaches have recently become one of the crucial\nresearch areas in imaging owing to their ability to learn expressive and\npowerful reconstruction operators even when paired high-quality training data\nis scarcely available. In this chapter, we review theoretically principled\nunsupervised learning schemes for solving imaging inverse problems, with a\nparticular focus on methods rooted in optimal transport and convex analysis. We\nbegin by reviewing the optimal transport-based unsupervised approaches such as\nthe cycle-consistency-based models and learned adversarial regularization\nmethods, which have clear probabilistic interpretations. Subsequently, we give\nan overview of a recent line of works on provably convergent learned\noptimization algorithms applied to accelerate the solution of imaging inverse\nproblems, alongside their dedicated unsupervised training schemes. We also\nsurvey a number of provably convergent plug-and-play algorithms (based on\ngradient-step deep denoisers), which are among the most important and widely\napplied unsupervised approaches for imaging problems. At the end of this\nsurvey, we provide an overview of a few related unsupervised learning\nframeworks that complement our focused schemes. Together with a detailed\nsurvey, we provide an overview of the key mathematical results that underlie\nthe methods reviewed in the chapter to keep our discussion self-contained.\n","authors":["Marcello Carioni","Subhadip Mukherjee","Hong Ye Tan","Junqi Tang"],"pdf_url":"https://arxiv.org/pdf/2311.08972v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17466v1","updated":"2023-11-29T09:18:39Z","published":"2023-11-29T09:18:39Z","title":"Slot-Mixup with Subsampling: A Simple Regularization for WSI\n Classification","summary":" Whole slide image (WSI) classification requires repetitive zoom-in and out\nfor pathologists, as only small portions of the slide may be relevant to\ndetecting cancer. 
Due to the lack of patch-level labels, multiple instance\nlearning (MIL) is a common practice for training a WSI classifier. One of the\nchallenges in MIL for WSIs is the weak supervision coming only from the\nslide-level labels, often resulting in severe overfitting. In response,\nresearchers have considered adopting patch-level augmentation or applying mixup\naugmentation, but their applicability remains unverified. Our approach augments\nthe training dataset by sampling a subset of patches in the WSI without\nsignificantly altering the underlying semantics of the original slides.\nAdditionally, we introduce an efficient model (Slot-MIL) that organizes patches\ninto a fixed number of slots, the abstract representation of patches, using an\nattention mechanism. We empirically demonstrate that the subsampling\naugmentation helps to make more informative slots by restricting the\nover-concentration of attention and to improve interpretability. Finally, we\nillustrate that combining our attention-based aggregation model with\nsubsampling and mixup, which has shown limited compatibility in existing MIL\nmethods, can enhance both generalization and calibration. Our proposed methods\nachieve the state-of-the-art performance across various benchmark datasets\nincluding class imbalance and distribution shifts.\n","authors":["Seongho Keum","Sanghyun Kim","Soojeong Lee","Juho Lee"],"pdf_url":"https://arxiv.org/pdf/2311.17466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07598v4","updated":"2023-11-29T08:56:29Z","published":"2023-05-12T16:42:54Z","title":"Hausdorff Distance Matching with Adaptive Query Denoising for Rotated\n Detection Transformer","summary":" The Detection Transformer (DETR) has emerged as a pivotal role in object\ndetection tasks, setting new performance benchmarks due to its end-to-end\ndesign and scalability. Despite its advancements, the application of DETR in\ndetecting rotated objects has demonstrated suboptimal performance relative to\nestablished oriented object detectors. Our analysis identifies a key\nlimitation: the L1 cost used in Hungarian Matching leads to duplicate\npredictions due to the square-like problem in oriented object detection,\nthereby obstructing the training process of the detector. We introduce a\nHausdorff distance-based cost for Hungarian matching, which more accurately\nquantifies the discrepancy between predictions and ground truths. Moreover, we\nnote that a static denoising approach hampers the training of rotated DETR,\nparticularly when the detector's predictions surpass the quality of noised\nground truths. We propose an adaptive query denoising technique, employing\nHungarian matching to selectively filter out superfluous noised queries that no\nlonger contribute to model improvement. Our proposed modifications to DETR have\nresulted in superior performance, surpassing previous rotated DETR models and\nother alternatives. 
This is evidenced by our model's state-of-the-art\nachievements in benchmarks such as DOTA-v1.0/v1.5/v2.0, and DIOR-R.\n","authors":["Hakjin Lee","Minki Song","Jamyoung Koo","Junghoon Seo"],"pdf_url":"https://arxiv.org/pdf/2305.07598v4.pdf","comment":"Under review, 16 pages, 12 tables, 8 figures"},{"id":"http://arxiv.org/abs/2311.17451v1","updated":"2023-11-29T08:48:26Z","published":"2023-11-29T08:48:26Z","title":"Wireless Network Digital Twin for 6G: Generative AI as A Key Enabler","summary":" Digital twin, which enables emulation, evaluation, and optimization of\nphysical entities through synchronized digital replicas, has gained\nincreasingly attention as a promising technology for intricate wireless\nnetworks. For 6G, numerous innovative wireless technologies and network\narchitectures have posed new challenges in establishing wireless network\ndigital twins. To tackle these challenges, artificial intelligence (AI),\nparticularly the flourishing generative AI, emerges as a potential solution. In\nthis article, we discuss emerging prerequisites for wireless network digital\ntwins considering the complicated network architecture, tremendous network\nscale, extensive coverage, and diversified application scenarios in the 6G era.\nWe further explore the applications of generative AI, such as transformer and\ndiffusion model, to empower the 6G digital twin from multiple perspectives\nincluding implementation, physical-digital synchronization, and slicing\ncapability. Subsequently, we propose a hierarchical generative AI-enabled\nwireless network digital twin at both the message-level and policy-level, and\nprovide a typical use case with numerical results to validate the effectiveness\nand efficiency. Finally, open research issues for wireless network digital\ntwins in the 6G era are discussed.\n","authors":["Zhenyu Tao","Wei Xu","Yongming Huang","Xiaoyun Wang","Xiaohu You"],"pdf_url":"https://arxiv.org/pdf/2311.17451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14056v2","updated":"2023-11-29T08:43:45Z","published":"2023-11-23T15:19:30Z","title":"DPSUR: Accelerating Differentially Private Stochastic Gradient Descent\n Using Selective Update and Release","summary":" Machine learning models are known to memorize private data to reduce their\ntraining loss, which can be inadvertently exploited by privacy attacks such as\nmodel inversion and membership inference. To protect against these attacks,\ndifferential privacy (DP) has become the de facto standard for\nprivacy-preserving machine learning, particularly those popular training\nalgorithms using stochastic gradient descent, such as DPSGD. Nonetheless, DPSGD\nstill suffers from severe utility loss due to its slow convergence. This is\npartially caused by the random sampling, which brings bias and variance to the\ngradient, and partially by the Gaussian noise, which leads to fluctuation of\ngradient updates.\n Our key idea to address these issues is to apply selective updates to the\nmodel training, while discarding those useless or even harmful updates.\nMotivated by this, this paper proposes DPSUR, a Differentially Private training\nframework based on Selective Updates and Release, where the gradient from each\niteration is evaluated based on a validation test, and only those updates\nleading to convergence are applied to the model. As such, DPSUR ensures the\ntraining in the right direction and thus can achieve faster convergence than\nDPSGD. 
The main challenges lie in two aspects -- privacy concerns arising from\ngradient evaluation, and gradient selection strategy for model update. To\naddress the challenges, DPSUR introduces a clipping strategy for update\nrandomization and a threshold mechanism for gradient selection. Experiments\nconducted on MNIST, FMNIST, CIFAR-10, and IMDB datasets show that DPSUR\nsignificantly outperforms previous works in terms of convergence speed and\nmodel utility.\n","authors":["Jie Fu","Qingqing Ye","Haibo Hu","Zhili Chen","Lulu Wang","Kuncan Wang","Xun Ran"],"pdf_url":"https://arxiv.org/pdf/2311.14056v2.pdf","comment":"This paper has been accepted by VLDB 2024"},{"id":"http://arxiv.org/abs/2311.17446v1","updated":"2023-11-29T08:40:46Z","published":"2023-11-29T08:40:46Z","title":"Uncertainty in Additive Feature Attribution methods","summary":" In this work, we explore various topics that fall under the umbrella of\nUncertainty in post-hoc Explainable AI (XAI) methods. We in particular focus on\nthe class of additive feature attribution explanation methods. We first\ndescribe our specifications of uncertainty and compare various statistical and\nrecent methods to quantify the same. Next, for a particular instance, we study\nthe relationship between a feature's attribution and its uncertainty and\nobserve little correlation. As a result, we propose a modification in the\ndistribution from which perturbations are sampled in LIME-based algorithms such\nthat the important features have minimal uncertainty without an increase in\ncomputational cost. Next, while studying how the uncertainty in explanations\nvaries across the feature space of a classifier, we observe that a fraction of\ninstances show near-zero uncertainty. We coin the term \"stable instances\" for\nsuch instances and diagnose factors that make an instance stable. Next, we\nstudy how an XAI algorithm's uncertainty varies with the size and complexity of\nthe underlying model. We observe that the more complex the model, the more\ninherent uncertainty is exhibited by it. As a result, we propose a measure to\nquantify the relative complexity of a blackbox classifier. This could be\nincorporated, for example, in LIME-based algorithms' sampling densities, to\nhelp different explanation algorithms achieve tighter confidence levels.\nTogether, the above measures would have a strong impact on making XAI models\nrelatively trustworthy for the end-user as well as aiding scientific discovery.\n","authors":["Abhishek Madaan","Tanya Chowdhury","Neha Rana","James Allan","Tanmoy Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2311.17446v1.pdf","comment":"14"},{"id":"http://arxiv.org/abs/2311.17434v1","updated":"2023-11-29T08:26:18Z","published":"2023-11-29T08:26:18Z","title":"Group-wise Sparse and Explainable Adversarial Attacks","summary":" Sparse adversarial attacks fool deep neural networks (DNNs) through minimal\npixel perturbations, typically regularized by the $\\ell_0$ norm. Recent efforts\nhave replaced this norm with a structural sparsity regularizer, such as the\nnuclear group norm, to craft group-wise sparse adversarial attacks. The\nresulting perturbations are thus explainable and hold significant practical\nrelevance, shedding light on an even greater vulnerability of DNNs than\npreviously anticipated. However, crafting such attacks poses an optimization\nchallenge, as it involves computing norms for groups of pixels within a\nnon-convex objective. 
In this paper, we tackle this challenge by presenting an\nalgorithm that simultaneously generates group-wise sparse attacks within\nsemantically meaningful areas of an image. In each iteration, the core\noperation of our algorithm involves the optimization of a quasinorm adversarial\nloss. This optimization is achieved by employing the $1/2$-quasinorm proximal\noperator for some iterations, a method tailored for nonconvex programming.\nSubsequently, the algorithm transitions to a projected Nesterov's accelerated\ngradient descent with $2$-norm regularization applied to perturbation\nmagnitudes. We rigorously evaluate the efficacy of our novel attack in both\ntargeted and non-targeted attack scenarios, on CIFAR-10 and ImageNet datasets.\nWhen compared to state-of-the-art methods, our attack consistently results in a\nremarkable increase in group-wise sparsity, e.g., an increase of $48.12\\%$ on\nCIFAR-10 and $40.78\\%$ on ImageNet (average case, targeted attack), all while\nmaintaining lower perturbation magnitudes. Notably, this performance is\ncomplemented by a significantly faster computation time and a $100\\%$ attack\nsuccess rate.\n","authors":["Shpresim Sadiku","Moritz Wagner","Sebastian Pokutta"],"pdf_url":"https://arxiv.org/pdf/2311.17434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17431v1","updated":"2023-11-29T08:21:42Z","published":"2023-11-29T08:21:42Z","title":"Grounding Foundation Models through Federated Transfer Learning: A\n General Framework","summary":" Foundation Models (FMs) such as GPT-4 encoded with vast knowledge and\npowerful emergent abilities have achieved remarkable success in various natural\nlanguage processing and computer vision tasks. Grounding FMs by adapting them\nto domain-specific tasks or augmenting them with domain-specific knowledge\nenables us to exploit the full potential of FMs. However, grounding FMs faces\nseveral challenges, stemming primarily from constrained computing resources,\ndata privacy, model heterogeneity, and model ownership. Federated Transfer\nLearning (FTL), the combination of federated learning and transfer learning,\nprovides promising solutions to address these challenges. In recent years, the\nneed for grounding FMs leveraging FTL, coined FTL-FM, has arisen strongly in\nboth academia and industry. Motivated by the strong growth in FTL-FM research\nand the potential impact of FTL-FM on industrial applications, we propose an\nFTL-FM framework that formulates problems of grounding FMs in the federated\nlearning setting, construct a detailed taxonomy based on the FTL-FM framework\nto categorize state-of-the-art FTL-FM works, and comprehensively overview\nFTL-FM works based on the proposed taxonomy. We also establish correspondences\nbetween FTL-FM and conventional phases of adapting FM so that FM practitioners\ncan align their research works with FTL-FM. In addition, we overview advanced\nefficiency-improving and privacy-preserving techniques because efficiency and\nprivacy are critical concerns in FTL-FM. 
Last, we discuss opportunities and\nfuture research directions of FTL-FM.\n","authors":["Yan Kang","Tao Fan","Hanlin Gu","Lixin Fan","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2311.17431v1.pdf","comment":"in progress"},{"id":"http://arxiv.org/abs/2303.10426v2","updated":"2023-11-29T07:44:00Z","published":"2023-03-18T14:37:37Z","title":"Discovering Predictable Latent Factors for Time Series Forecasting","summary":" Modern time series forecasting methods, such as Transformer and its variants,\nhave shown strong ability in sequential data modeling. To achieve high\nperformance, they usually rely on redundant or unexplainable structures to\nmodel complex relations between variables and tune the parameters with\nlarge-scale data. Many real-world data mining tasks, however, lack sufficient\nvariables for relation reasoning, and therefore these methods may not properly\nhandle such forecasting problems. With insufficient data, time series appear to\nbe affected by many exogenous variables, and thus, the modeling becomes\nunstable and unpredictable. To tackle this critical issue, in this paper, we\ndevelop a novel algorithmic framework for inferring the intrinsic latent\nfactors implied by the observable time series. The inferred factors are used to\nform multiple independent and predictable signal components that enable not\nonly sparse relation reasoning for long-term efficiency but also reconstructing\nthe future temporal data for accurate prediction. To achieve this, we introduce\nthree characteristics, i.e., predictability, sufficiency, and identifiability,\nand model these characteristics via the powerful deep latent dynamics models to\ninfer the predictable signal components. Empirical results on multiple real\ndatasets show the efficiency of our method for different kinds of time series\nforecasting. The statistical analysis validates the predictability of the\nlearned latent factors.\n","authors":["Jingyi Hou","Zhen Dong","Jiayu Zhou","Zhijie Liu"],"pdf_url":"https://arxiv.org/pdf/2303.10426v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16614v2","updated":"2023-11-29T07:34:52Z","published":"2023-11-28T09:11:02Z","title":"A Multivariate Unimodality Test Harnenssing the Dip Statistic of\n Mahalanobis Distances Over Random Projections","summary":" Unimodality, pivotal in statistical analysis, offers insights into dataset\nstructures and drives sophisticated analytical procedures. While unimodality's\nconfirmation is straightforward for one-dimensional data using methods like\nSilverman's approach and Hartigans' dip statistic, its generalization to higher\ndimensions remains challenging. By extrapolating one-dimensional unimodality\nprinciples to multi-dimensional spaces through linear random projections and\nleveraging point-to-point distancing, our method, rooted in\n$\\alpha$-unimodality assumptions, presents a novel multivariate unimodality\ntest named mud-pod. Both theoretical and empirical studies confirm the efficacy\nof our method in unimodality assessment of multidimensional datasets as well as\nin estimating the number of clusters.\n","authors":["Prodromos Kolyvakis","Aristidis Likas"],"pdf_url":"https://arxiv.org/pdf/2311.16614v2.pdf","comment":"11 pages, 1 figure"},{"id":"http://arxiv.org/abs/2311.17410v1","updated":"2023-11-29T07:30:32Z","published":"2023-11-29T07:30:32Z","title":"GNNFlow: A Distributed Framework for Continuous Temporal GNN Learning on\n Dynamic Graphs","summary":" Graph Neural Networks (GNNs) play a crucial role in various fields. 
However,\nmost existing deep graph learning frameworks assume pre-stored static graphs\nand do not support training on graph streams. In contrast, many real-world\ngraphs are dynamic and contain time domain information. We introduce GNNFlow, a\ndistributed framework that enables efficient continuous temporal graph\nrepresentation learning on dynamic graphs on multi-GPU machines. GNNFlow\nintroduces an adaptive time-indexed block-based data structure that effectively\nbalances memory usage with graph update and sampling operation efficiency. It\nfeatures a hybrid GPU-CPU graph data placement for rapid GPU-based temporal\nneighborhood sampling and kernel optimizations for enhanced sampling processes.\nA dynamic GPU cache for node and edge features is developed to maximize cache\nhit rates through reuse and restoration strategies. GNNFlow supports\ndistributed training across multiple machines with static scheduling to ensure\nload balance. We implement GNNFlow based on DGL and PyTorch. Our experimental\nresults show that GNNFlow provides up to 21.1x faster continuous learning than\nexisting systems.\n","authors":["Yuchen Zhong","Guangming Sheng","Tianzuo Qin","Minjie Wang","Quan Gan","Chuan Wu"],"pdf_url":"https://arxiv.org/pdf/2311.17410v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17401v1","updated":"2023-11-29T07:09:25Z","published":"2023-11-29T07:09:25Z","title":"Gene-MOE: A Sparsely-gated Framework for Pan-Cancer Genomic Analysis","summary":" Analyzing the genomic information from the Pan-Cancer database can help us\nunderstand cancer-related factors and contribute to the cancer diagnosis and\nprognosis. However, existing computational methods and deep learning methods\ncan not effectively find the deep correlations between tens of thousands of\ngenes, which leads to precision loss. In this paper, we proposed a novel\npretrained model called Gene-MOE to learn the general feature representations\nof the Pan-Cancer dataset and transfer the pretrained weights to the downstream\ntasks. The Gene-MOE fully exploits the mixture of expert (MOE) layers to learn\nrich feature representations of high-dimensional genes. At the same time, we\nbuild a mixture of attention expert (MOAE) model to learn the deep semantic\nrelationships within genetic features. Finally, we proposed a new\nself-supervised pretraining strategy including loss function design, data\nenhancement, and optimization strategy to train the Gene-MOE and further\nimprove the performance for the downstream analysis. We carried out cancer\nclassification and survival analysis experiments based on the Gene-MOE.\nAccording to the survival analysis results on 14 cancer types, using Gene-MOE\noutperformed state-of-the-art models on 12 cancer types. According to the\nclassification results, the total accuracy of the classification model for 33\ncancer classifications reached 95.2\\%. 
Through detailed feature analysis, we\nfound the Gene-MOE model can learn rich feature representations of\nhigh-dimensional genes.\n","authors":["Xiangyu Meng","Tao Song","Qing Yang","Huanhuan Dai","Lian Qiao","Hongzhen Ding","Long Hao","Xun Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17401v1.pdf","comment":"submit to bioinformatics"},{"id":"http://arxiv.org/abs/2311.17400v1","updated":"2023-11-29T07:09:13Z","published":"2023-11-29T07:09:13Z","title":"Improving the Robustness of Transformer-based Large Language Models with\n Dynamic Attention","summary":" Transformer-based models, such as BERT and GPT, have been widely adopted in\nnatural language processing (NLP) due to their exceptional performance.\nHowever, recent studies show their vulnerability to textual adversarial attacks\nwhere the model's output can be misled by intentionally manipulating the text\ninputs. Despite various methods that have been proposed to enhance the model's\nrobustness and mitigate this vulnerability, many require heavy consumption\nresources (e.g., adversarial training) or only provide limited protection\n(e.g., defensive dropout). In this paper, we propose a novel method called\ndynamic attention, tailored for the transformer architecture, to enhance the\ninherent robustness of the model itself against various adversarial attacks.\nOur method requires no downstream task knowledge and does not incur additional\ncosts. The proposed dynamic attention consists of two modules: (I) attention\nrectification, which masks or weakens the attention value of the chosen tokens,\nand (ii) dynamic modeling, which dynamically builds the set of candidate\ntokens. Extensive experiments demonstrate that dynamic attention significantly\nmitigates the impact of adversarial attacks, improving up to 33\\% better\nperformance than previous methods against widely-used adversarial attacks. The\nmodel-level design of dynamic attention enables it to be easily combined with\nother defense methods (e.g., adversarial training) to further enhance the\nmodel's robustness. Furthermore, we demonstrate that dynamic attention\npreserves the state-of-the-art robustness space of the original model compared\nto other dynamic modeling methods.\n","authors":["Lujia Shen","Yuwen Pu","Shouling Ji","Changjiang Li","Xuhong Zhang","Chunpeng Ge","Ting Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17400v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01552v2","updated":"2023-11-29T06:40:01Z","published":"2023-04-04T06:06:59Z","title":"Meta-Learning with a Geometry-Adaptive Preconditioner","summary":" Model-agnostic meta-learning (MAML) is one of the most successful\nmeta-learning algorithms. It has a bi-level optimization structure where the\nouter-loop process learns a shared initialization and the inner-loop process\noptimizes task-specific weights. Although MAML relies on the standard gradient\ndescent in the inner-loop, recent studies have shown that controlling the\ninner-loop's gradient descent with a meta-learned preconditioner can be\nbeneficial. Existing preconditioners, however, cannot simultaneously adapt in a\ntask-specific and path-dependent way. Additionally, they do not satisfy the\nRiemannian metric condition, which can enable the steepest descent learning\nwith preconditioned gradient. 
In this study, we propose Geometry-Adaptive\nPreconditioned gradient descent (GAP) that can overcome the limitations in\nMAML; GAP can efficiently meta-learn a preconditioner that is dependent on\ntask-specific parameters, and its preconditioner can be shown to be a\nRiemannian metric. Thanks to the two properties, the geometry-adaptive\npreconditioner is effective for improving the inner-loop optimization.\nExperiment results show that GAP outperforms the state-of-the-art MAML family\nand preconditioned gradient descent-MAML (PGD-MAML) family in a variety of\nfew-shot learning tasks. Code is available at:\nhttps://github.com/Suhyun777/CVPR23-GAP.\n","authors":["Suhyun Kang","Duhun Hwang","Moonjung Eo","Taesup Kim","Wonjong Rhee"],"pdf_url":"https://arxiv.org/pdf/2304.01552v2.pdf","comment":"Accepted at CVPR 2023. Code is available at:\n https://github.com/Suhyun777/CVPR23-GAP; This is an extended version of our\n previous CVPR23 work"},{"id":"http://arxiv.org/abs/2311.17373v1","updated":"2023-11-29T05:54:58Z","published":"2023-11-29T05:54:58Z","title":"The Devil is in the Data: Learning Fair Graph Neural Networks via\n Partial Knowledge Distillation","summary":" Graph neural networks (GNNs) are being increasingly used in many high-stakes\ntasks, and as a result, there is growing attention on their fairness recently.\nGNNs have been shown to be unfair as they tend to make discriminatory decisions\ntoward certain demographic groups, divided by sensitive attributes such as\ngender and race. While recent works have been devoted to improving their\nfairness performance, they often require accessible demographic information.\nThis greatly limits their applicability in real-world scenarios due to legal\nrestrictions. To address this problem, we present a demographic-agnostic method\nto learn fair GNNs via knowledge distillation, namely FairGKD. Our work is\nmotivated by the empirical observation that training GNNs on partial data\n(i.e., only node attributes or topology data) can improve their fairness,\nalbeit at the cost of utility. To make a balanced trade-off between fairness\nand utility performance, we employ a set of fairness experts (i.e., GNNs\ntrained on different partial data) to construct the synthetic teacher, which\ndistills fairer and informative knowledge to guide the learning of the GNN\nstudent. Experiments on several benchmark datasets demonstrate that FairGKD,\nwhich does not require access to demographic information, significantly\nimproves the fairness of GNNs by a large margin while maintaining their\nutility.\n","authors":["Yuchang Zhu","Jintang Li","Liang Chen","Zibin Zheng"],"pdf_url":"https://arxiv.org/pdf/2311.17373v1.pdf","comment":"Accepted by WSDM 2024"},{"id":"http://arxiv.org/abs/2310.18348v3","updated":"2023-11-29T05:32:24Z","published":"2023-10-23T04:35:58Z","title":"Meaning Representations from Trajectories in Autoregressive Models","summary":" We propose to extract meaning representations from autoregressive language\nmodels by considering the distribution of all possible trajectories extending\nan input text. This strategy is prompt-free, does not require fine-tuning, and\nis applicable to any pre-trained autoregressive model. Moreover, unlike\nvector-based representations, distribution-based representations can also model\nasymmetric relations (e.g., direction of logical entailment, hypernym/hyponym\nrelations) by using algebraic operations between likelihood functions. 
These\nideas are grounded in distributional perspectives on semantics and are\nconnected to standard constructions in automata theory, but to our knowledge\nthey have not been applied to modern language models. We empirically show that\nthe representations obtained from large models align well with human\nannotations, outperform other zero-shot and prompt-free methods on semantic\nsimilarity tasks, and can be used to solve more complex entailment and\ncontainment tasks that standard embeddings cannot handle. Finally, we extend\nour method to represent data from different modalities (e.g., image and text)\nusing multimodal autoregressive models. Our code is available at:\nhttps://github.com/tianyu139/meaning-as-trajectories\n","authors":["Tian Yu Liu","Matthew Trager","Alessandro Achille","Pramuditha Perera","Luca Zancato","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2310.18348v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09791v2","updated":"2023-11-29T05:02:39Z","published":"2023-08-18T19:40:59Z","title":"An Efficient High-Dimensional Gene Selection Approach based on Binary\n Horse Herd Optimization Algorithm for Biological Data Classification","summary":" The Horse Herd Optimization Algorithm (HOA) is a new meta-heuristic algorithm\nbased on the behaviors of horses at different ages. The HOA was introduced\nrecently to solve complex and high-dimensional problems. This paper proposes a\nbinary version of the Horse Herd Optimization Algorithm (BHOA) in order to\nsolve discrete problems and select prominent feature subsets. Moreover, this\nstudy provides a novel hybrid feature selection framework based on the BHOA and\na minimum Redundancy Maximum Relevance (MRMR) filter method. This hybrid\nfeature selection, which is more computationally efficient, produces a\nbeneficial subset of relevant and informative features. Since feature selection\nis a binary problem, we have applied a new Transfer Function (TF), called\nX-shape TF, which transforms continuous problems into binary search spaces.\nFurthermore, the Support Vector Machine (SVM) is utilized to examine the\nefficiency of the proposed method on ten microarray datasets, namely Lymphoma,\nProstate, Brain-1, DLBCL, SRBCT, Leukemia, Ovarian, Colon, Lung, and MLL. In\ncomparison to other state-of-the-art, such as the Gray Wolf (GW), Particle\nSwarm Optimization (PSO), and Genetic Algorithm (GA), the proposed hybrid\nmethod (MRMR-BHOA) demonstrates superior performance in terms of accuracy and\nminimum selected features. Also, experimental results prove that the X-Shaped\nBHOA approach outperforms others methods.\n","authors":["Niloufar Mehrabi","Sayed Pedram Haeri Boroujeni","Elnaz Pashaei"],"pdf_url":"https://arxiv.org/pdf/2308.09791v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17353v1","updated":"2023-11-29T04:48:09Z","published":"2023-11-29T04:48:09Z","title":"Continuous optimization by quantum adaptive distribution search","summary":" In this paper, we introduce the quantum adaptive distribution search (QuADS),\na quantum continuous optimization algorithm that integrates Grover adaptive\nsearch (GAS) with the covariance matrix adaptation - evolution strategy\n(CMA-ES), a classical technique for continuous optimization. QuADS utilizes the\nquantum-based search capabilities of GAS and enhances them with the principles\nof CMA-ES for more efficient optimization. 
It employs a multivariate normal\ndistribution for the initial state of the quantum search and repeatedly updates\nit throughout the optimization process. Our numerical experiments show that\nQuADS outperforms both GAS and CMA-ES. This is achieved through adaptive\nrefinement of the initial state distribution rather than consistently using a\nuniform state, resulting in fewer oracle calls. This study presents an\nimportant step toward exploiting the potential of quantum computing for\ncontinuous optimization.\n","authors":["Kohei Morimoto","Yusuke Takase","Kosuke Mitarai","Keisuke Fujii"],"pdf_url":"https://arxiv.org/pdf/2311.17353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17352v1","updated":"2023-11-29T04:31:35Z","published":"2023-11-29T04:31:35Z","title":"Efficient Stitchable Task Adaptation","summary":" The paradigm of pre-training and fine-tuning has laid the foundation for\ndeploying deep learning models. However, most fine-tuning methods are designed\nto meet a specific resource budget. Recently, considering diverse deployment\nscenarios with various resource budgets, stitchable neural network (SN-Net) is\nintroduced to quickly obtain numerous new networks (stitches) from the\npre-trained models (anchors) in a model family via model stitching. Although\npromising, SN-Net confronts new challenges when adapting it to new target\ndomains, including huge memory and storage requirements and a long and\nsub-optimal multistage adaptation process. In this work, we present a novel\nframework, Efficient Stitchable Task Adaptation (ESTA), to efficiently produce\na palette of fine-tuned models that adhere to diverse resource constraints.\nSpecifically, we first tailor parameter-efficient fine-tuning to share low-rank\nupdates among the stitches while maintaining independent bias terms. In this\nway, we largely reduce fine-tuning memory burdens and mitigate the interference\namong stitches that arises in task adaptation. Furthermore, we streamline a\nsimple yet effective one-stage deployment pipeline, which estimates the\nimportant stitches to deploy with training-time gradient statistics. By\nassigning higher sampling probabilities to important stitches, we also get a\nboosted Pareto frontier. Extensive experiments on 25 downstream visual\nrecognition tasks demonstrate that our ESTA is capable of generating stitches\nwith smooth accuracy-efficiency trade-offs and surpasses the direct SN-Net\nadaptation by remarkable margins with significantly lower training time and\nfewer trainable parameters. Furthermore, we demonstrate the flexibility and\nscalability of our ESTA framework by stitching LLMs from LLaMA family,\nobtaining chatbot stitches of assorted sizes.\n","authors":["Haoyu He","Zizheng Pan","Jing Liu","Jianfei Cai","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2311.17352v1.pdf","comment":"Source code will be released at\n https://github.com/ziplab/Stitched_LLaMA"},{"id":"http://arxiv.org/abs/2311.14698v2","updated":"2023-11-29T04:23:13Z","published":"2023-11-10T00:14:14Z","title":"Business Policy Experiments using Fractional Factorial Designs: Consumer\n Retention on DoorDash","summary":" This paper investigates an approach to both speed up business decision-making\nand lower the cost of learning through experimentation by factorizing business\npolicies and employing fractional factorial experimental designs for their\nevaluation. 
We illustrate how this method integrates with advances in the\nestimation of heterogeneous treatment effects, elaborating on its advantages\nand foundational assumptions. We empirically demonstrate the implementation and\nbenefits of our approach and assess its validity in evaluating consumer\npromotion policies at DoorDash, which is one of the largest delivery platforms\nin the US. Our approach discovers a policy with 5% incremental profit at 67%\nlower implementation cost.\n","authors":["Yixin Tang","Yicong Lin","Navdeep S. Sahni"],"pdf_url":"https://arxiv.org/pdf/2311.14698v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2006.09017v2","updated":"2023-11-29T04:20:33Z","published":"2020-06-16T09:31:58Z","title":"Estimates on Learning Rates for Multi-Penalty Distribution Regression","summary":" This paper is concerned with functional learning by utilizing two-stage\nsampled distribution regression. We study a multi-penalty regularization\nalgorithm for distribution regression under the framework of learning theory.\nThe algorithm aims at regressing to real valued outputs from probability\nmeasures. The theoretical analysis on distribution regression is far from\nmaturity and quite challenging, since only second stage samples are observable\nin practical setting. In the algorithm, to transform information from samples,\nwe embed the distributions to a reproducing kernel Hilbert space\n$\\mathcal{H}_K$ associated with Mercer kernel $K$ via mean embedding technique.\nThe main contribution of the paper is to present a novel multi-penalty\nregularization algorithm to capture more features of distribution regression\nand derive optimal learning rates for the algorithm. The work also derives\nlearning rates for distribution regression in the nonstandard setting\n$f_{\\rho}\\notin\\mathcal{H}_K$, which is not explored in existing literature.\nMoreover, we propose a distribution regression-based distributed learning\nalgorithm to face large-scale data or information challenge. The optimal\nlearning rates are derived for the distributed learning algorithm. By providing\nnew algorithms and showing their learning rates, we improve the existing work\nin different aspects in the literature.\n","authors":["Zhan Yu","Daniel W. C. Ho"],"pdf_url":"https://arxiv.org/pdf/2006.09017v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01768v2","updated":"2023-11-29T03:43:56Z","published":"2023-10-03T03:32:07Z","title":"Backdiff: a diffusion model for generalized transferable protein\n backmapping","summary":" Coarse-grained (CG) models play a crucial role in the study of protein\nstructures, protein thermodynamic properties, and protein conformation\ndynamics. Due to the information loss in the coarse-graining process,\nbackmapping from CG to all-atom configurations is essential in many protein\ndesign and drug discovery applications when detailed atomic representations are\nneeded for in-depth studies. Despite recent progress in data-driven backmapping\napproaches, devising a backmapping method that can be universally applied\nacross various CG models and proteins remains unresolved. In this work, we\npropose BackDiff, a new generative model designed to achieve generalization and\nreliability in the protein backmapping problem. BackDiff leverages the\nconditional score-based diffusion model with geometric representations. 
Since\ndifferent CG models can contain different coarse-grained sites which include\nselected atoms (CG atoms) and simple CG auxiliary functions of atomistic\ncoordinates (CG auxiliary variables), we design a self-supervised training\nframework to adapt to different CG atoms, and constrain the diffusion sampling\npaths with arbitrary CG auxiliary variables as conditions. Our method\nfacilitates end-to-end training and allows efficient sampling across different\nproteins and diverse CG models without the need for retraining. Comprehensive\nexperiments over multiple popular CG models demonstrate BackDiff's superior\nperformance to existing state-of-the-art approaches, and generalization and\nflexibility that these approaches cannot achieve. A pretrained BackDiff model\ncan offer a convenient yet reliable plug-and-play solution for protein\nresearchers, enabling them to investigate further from their own CG models.\n","authors":["Yikai Liu","Ming Chen","Guang Lin"],"pdf_url":"https://arxiv.org/pdf/2310.01768v2.pdf","comment":"22 pages, 5 figures"},{"id":"http://arxiv.org/abs/2102.07737v4","updated":"2023-11-29T03:43:13Z","published":"2021-02-15T18:34:38Z","title":"Zero-Shot Self-Supervised Learning for MRI Reconstruction","summary":" Deep learning (DL) has emerged as a powerful tool for accelerated MRI\nreconstruction, but often necessitates a database of fully-sampled measurements\nfor training. Recent self-supervised and unsupervised learning approaches\nenable training without fully-sampled data. However, a database of undersampled\nmeasurements may not be available in many scenarios, especially for scans\ninvolving contrast or translational acquisitions in development. Moreover,\nrecent studies show that database-trained models may not generalize well when\nthe unseen measurements differ in terms of sampling pattern, acceleration rate,\nSNR, image contrast, and anatomy. Such challenges necessitate a new methodology\nto enable subject-specific DL MRI reconstruction without external training\ndatasets, since it is clinically imperative to provide high-quality\nreconstructions that can be used to identify lesions/disease for \\emph{every\nindividual}. In this work, we propose a zero-shot self-supervised learning\napproach to perform subject-specific accelerated DL MRI reconstruction to\ntackle these issues. The proposed approach partitions the available\nmeasurements from a single scan into three disjoint sets. Two of these sets are\nused to enforce data consistency and define loss during training for\nself-supervision, while the last set serves to self-validate, establishing an\nearly stopping criterion. In the presence of models pre-trained on a database\nwith different image characteristics, we show that the proposed approach can be\ncombined with transfer learning for faster convergence time and reduced\ncomputational complexity. The code is available at\n\\url{https://github.com/byaman14/ZS-SSL}.\n","authors":["Burhaneddin Yaman","Seyed Amir Hossein Hosseini","Mehmet Akçakaya"],"pdf_url":"https://arxiv.org/pdf/2102.07737v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.04869v3","updated":"2023-11-29T03:16:59Z","published":"2022-07-08T17:43:20Z","title":"Graph-based Molecular Representation Learning","summary":" Molecular representation learning (MRL) is a key step to build the connection\nbetween machine learning and chemical science. 
In particular, it encodes\nmolecules as numerical vectors preserving the molecular structures and\nfeatures, on top of which the downstream tasks (e.g., property prediction) can\nbe performed. Recently, MRL has achieved considerable progress, especially in\nmethods based on deep molecular graph learning. In this survey, we\nsystematically review these graph-based molecular representation techniques,\nespecially the methods incorporating chemical domain knowledge. Specifically,\nwe first introduce the features of 2D and 3D molecular graphs. Then we\nsummarize and categorize MRL methods into three groups based on their input.\nFurthermore, we discuss some typical chemical applications supported by MRL. To\nfacilitate studies in this fast-developing area, we also list the benchmarks\nand commonly used datasets in the paper. Finally, we share our thoughts on\nfuture research directions.\n","authors":["Zhichun Guo","Kehan Guo","Bozhao Nan","Yijun Tian","Roshni G. Iyer","Yihong Ma","Olaf Wiest","Xiangliang Zhang","Wei Wang","Chuxu Zhang","Nitesh V. Chawla"],"pdf_url":"https://arxiv.org/pdf/2207.04869v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08745v3","updated":"2023-11-29T03:12:00Z","published":"2023-11-15T07:27:40Z","title":"Using Stochastic Gradient Descent to Smooth Nonconvex Functions:\n Analysis of Implicit Graduated Optimization with Optimal Noise Scheduling","summary":" The graduated optimization approach is a heuristic method for finding\nglobally optimal solutions for nonconvex functions and has been theoretically\nanalyzed in several studies. This paper defines a new family of nonconvex\nfunctions for graduated optimization, discusses their sufficient conditions,\nand provides a convergence analysis of the graduated optimization algorithm for\nthem. It shows that stochastic gradient descent (SGD) with mini-batch\nstochastic gradients has the effect of smoothing the function, the degree of\nwhich is determined by the learning rate and batch size. This finding provides\ntheoretical insights on why large batch sizes fall into sharp local minima, why\ndecaying learning rates and increasing batch sizes are superior to fixed\nlearning rates and batch sizes, and what the optimal learning rate scheduling\nis. To the best of our knowledge, this is the first paper to provide a\ntheoretical explanation for these aspects. Moreover, a new graduated\noptimization framework that uses a decaying learning rate and increasing batch\nsize is analyzed and experimental results of image classification that support\nour theoretical findings are reported.\n","authors":["Naoki Sato","Hideaki Iiduka"],"pdf_url":"https://arxiv.org/pdf/2311.08745v3.pdf","comment":"The latest version was updated on Nov. 29"},{"id":"http://arxiv.org/abs/2311.17327v1","updated":"2023-11-29T02:58:30Z","published":"2023-11-29T02:58:30Z","title":"Improving Self-supervised Molecular Representation Learning using\n Persistent Homology","summary":" Self-supervised learning (SSL) has great potential for molecular\nrepresentation learning given the complexity of molecular graphs, the large\namounts of unlabelled data available, the considerable cost of obtaining labels\nexperimentally, and the hence often only small training datasets. The\nimportance of the topic is reflected in the variety of paradigms and\narchitectures that have been investigated recently. Yet the differences in\nperformance seem often minor and are barely understood to date. 
In this paper,\nwe study SSL based on persistent homology (PH), a mathematical tool for\nmodeling topological features of data that persist across multiple scales. It\nhas several unique features which particularly suit SSL, naturally offering:\ndifferent views of the data, stability in terms of distance preservation, and\nthe opportunity to flexibly incorporate domain knowledge. We (1) investigate an\nautoencoder, which shows the general representational power of PH, and (2)\npropose a contrastive loss that complements existing approaches. We rigorously\nevaluate our approach for molecular property prediction and demonstrate its\nparticular features in improving the embedding space: after SSL, the\nrepresentations are better and offer considerably more predictive power than\nthe baselines over different probing tasks; our loss increases baseline\nperformance, sometimes largely; and we often obtain substantial improvements\nover very small datasets, a common scenario in practice.\n","authors":["Yuankai Luo","Lei Shi","Veronika Thost"],"pdf_url":"https://arxiv.org/pdf/2311.17327v1.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.17326v1","updated":"2023-11-29T02:53:32Z","published":"2023-11-29T02:53:32Z","title":"Mostly Beneficial Clustering: Aggregating Data for Operational Decision\n Making","summary":" With increasingly volatile market conditions and rapid product innovations,\noperational decision-making for large-scale systems entails solving thousands\nof problems with limited data. Data aggregation is proposed to combine the data\nacross problems to improve the decisions obtained by solving those problems\nindividually. We propose a novel cluster-based shrunken-SAA approach that can\nexploit the cluster structure among problems when implementing the data\naggregation approaches. We prove that, as the number of problems grows,\nleveraging the known cluster structure among problems yields additional\nbenefits over the data aggregation approaches that neglect such structure. When\nthe cluster structure is unknown, we show that unveiling the cluster structure,\neven at the cost of a few data points, can be beneficial, especially when the\ndistance between clusters of problems is substantial. Our proposed approach can\nbe extended to general cost functions under mild conditions. When the number of\nproblems gets large, the optimality gap of our proposed approach decreases\nexponentially in the distance between the clusters. We explore the performance\nof the proposed approach through the application of managing newsvendor systems\nvia numerical experiments. We investigate the impacts of distance metrics\nbetween problem instances on the performance of the cluster-based Shrunken-SAA\napproach with synthetic data. We further validate our proposed approach with\nreal data and highlight the advantages of cluster-based data aggregation,\nespecially in the small-data large-scale regime, compared to the existing\napproaches.\n","authors":["Chengzhang Li","Zhenkang Peng","Ying Rong"],"pdf_url":"https://arxiv.org/pdf/2311.17326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.15646v4","updated":"2023-11-29T02:42:18Z","published":"2022-11-28T18:52:33Z","title":"Beyond Invariance: Test-Time Label-Shift Adaptation for Distributions\n with \"Spurious\" Correlations","summary":" Changes in the data distribution at test time can have deleterious effects on\nthe performance of predictive models $p(y|x)$. 
We consider situations where\nthere are additional meta-data labels (such as group labels), denoted by $z$,\nthat can account for such changes in the distribution. In particular, we assume\nthat the prior distribution $p(y, z)$, which models the dependence between the\nclass label $y$ and the \"nuisance\" factors $z$, may change across domains,\neither due to a change in the correlation between these terms, or a change in\none of their marginals. However, we assume that the generative model for\nfeatures $p(x|y,z)$ is invariant across domains. We note that this corresponds\nto an expanded version of the widely used \"label shift\" assumption, where the\nlabels now also include the nuisance factors $z$. Based on this observation, we\npropose a test-time label shift correction that adapts to changes in the joint\ndistribution $p(y, z)$ using EM applied to unlabeled samples from the target\ndomain distribution, $p_t(x)$. Importantly, we are able to avoid fitting a\ngenerative model $p(x|y, z)$, and merely need to reweight the outputs of a\ndiscriminative model $p_s(y, z|x)$ trained on the source distribution. We\nevaluate our method, which we call \"Test-Time Label-Shift Adaptation\" (TTLSA),\non several standard image and text datasets, as well as the CheXpert chest\nX-ray dataset, and show that it improves performance over methods that target\ninvariance to changes in the distribution, as well as baseline empirical risk\nminimization methods. Code for reproducing experiments is available at\nhttps://github.com/nalzok/test-time-label-shift .\n","authors":["Qingyao Sun","Kevin Murphy","Sayna Ebrahimi","Alexander D'Amour"],"pdf_url":"https://arxiv.org/pdf/2211.15646v4.pdf","comment":"24 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.17323v1","updated":"2023-11-29T02:40:12Z","published":"2023-11-29T02:40:12Z","title":"Accelerating DNN Training With Photonics: A Residue Number System-Based\n Design","summary":" Photonic computing is a compelling avenue for performing highly efficient\nmatrix multiplication, a crucial operation in Deep Neural Networks (DNNs).\nWhile this method has shown great success in DNN inference, meeting the high\nprecision demands of DNN training proves challenging due to the precision\nlimitations imposed by costly data converters and the analog noise inherent in\nphotonic hardware. This paper proposes Mirage, a photonic DNN training\naccelerator that overcomes the precision challenges in photonic hardware using\nthe Residue Number System (RNS). RNS is a numeral system based on modular\narithmetic$\\unicode{x2014}$allowing us to perform high-precision operations via\nmultiple low-precision modular operations. In this work, we present a novel\nmicro-architecture and dataflow for an RNS-based photonic tensor core\nperforming modular arithmetic in the analog domain. By combining RNS and\nphotonics, Mirage provides high energy efficiency without compromising\nprecision and can successfully train state-of-the-art DNNs achieving accuracy\ncomparable to FP32 training. 
Our study shows that on average across several\nDNNs when compared to systolic arrays, Mirage achieves more than $23.8\\times$\nfaster training and $32.1\\times$ lower EDP in an iso-energy scenario and\nconsumes $42.8\\times$ lower power with comparable or better EDP in an iso-area\nscenario.\n","authors":["Cansu Demirkiran","Guowei Yang","Darius Bunandar","Ajay Joshi"],"pdf_url":"https://arxiv.org/pdf/2311.17323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11254v3","updated":"2023-11-29T02:32:02Z","published":"2023-11-19T06:44:13Z","title":"BOIS: Bayesian Optimization of Interconnected Systems","summary":" Bayesian optimization (BO) has proven to be an effective paradigm for the\nglobal optimization of expensive-to-sample systems. One of the main advantages\nof BO is its use of Gaussian processes (GPs) to characterize model uncertainty\nwhich can be leveraged to guide the learning and search process. However, BO\ntypically treats systems as black-boxes and this limits the ability to exploit\nstructural knowledge (e.g., physics and sparse interconnections). Composite\nfunctions of the form $f(x, y(x))$, wherein GP modeling is shifted from the\nperformance function $f$ to an intermediate function $y$, offer an avenue for\nexploiting structural knowledge. However, the use of composite functions in a\nBO framework is complicated by the need to generate a probability density for\n$f$ from the Gaussian density of $y$ calculated by the GP (e.g., when $f$ is\nnonlinear it is not possible to obtain a closed-form expression). Previous work\nhas handled this issue using sampling techniques; these are easy to implement\nand flexible but are computationally intensive. In this work, we introduce a\nnew paradigm which allows for the efficient use of composite functions in BO;\nthis uses adaptive linearizations of $f$ to obtain closed-form expressions for\nthe statistical moments of the composite function. We show that this simple\napproach (which we call BOIS) enables the exploitation of structural knowledge,\nsuch as that arising in interconnected systems as well as systems that embed\nmultiple GP models and combinations of physics and GP models. Using a chemical\nprocess optimization case study, we benchmark the effectiveness of BOIS against\nstandard BO and sampling approaches. Our results indicate that BOIS achieves\nperformance gains and accurately captures the statistics of composite\nfunctions.\n","authors":["Leonardo D. González","Victor M. Zavala"],"pdf_url":"https://arxiv.org/pdf/2311.11254v3.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2305.18409v3","updated":"2023-11-29T02:19:00Z","published":"2023-05-28T16:13:59Z","title":"Direction-oriented Multi-objective Learning: Simple and Provable\n Stochastic Algorithms","summary":" Multi-objective optimization (MOO) has become an influential framework in\nmany machine learning problems with multiple objectives such as learning with\nmultiple criteria and multi-task learning (MTL). In this paper, we propose a\nnew direction-oriented multi-objective problem by regularizing the common\ndescent direction within a neighborhood of a direction that optimizes a linear\ncombination of objectives such as the average loss in MTL. This formulation\nincludes GD and MGDA as special cases, enjoys the direction-oriented benefit as\nin CAGrad, and facilitates the design of stochastic algorithms. 
To solve this\nproblem, we propose Stochastic Direction-oriented Multi-objective Gradient\ndescent (SDMGrad) with simple SGD type of updates, and its variant SDMGrad-OS\nwith an efficient objective sampling in the setting where the number of\nobjectives is large. For a constant-level regularization parameter $\\lambda$,\nwe show that SDMGrad and SDMGrad-OS provably converge to a Pareto stationary\npoint with improved complexities and milder assumptions. For an increasing\n$\\lambda$, this convergent point reduces to a stationary point of the linear\ncombination of objectives. We demonstrate the superior performance of the\nproposed methods in a series of tasks on multi-task supervised learning and\nreinforcement learning. Code is provided at\nhttps://github.com/ml-opt-lab/sdmgrad.\n","authors":["Peiyao Xiao","Hao Ban","Kaiyi Ji"],"pdf_url":"https://arxiv.org/pdf/2305.18409v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00290v2","updated":"2023-11-29T01:55:38Z","published":"2023-11-01T04:41:25Z","title":"Inference of CO2 flow patterns -- a feasibility study","summary":" As the global deployment of carbon capture and sequestration (CCS) technology\nintensifies in the fight against climate change, it becomes increasingly\nimperative to establish robust monitoring and detection mechanisms for\npotential underground CO2 leakage, particularly through pre-existing or induced\nfaults in the storage reservoir's seals. While techniques such as history\nmatching and time-lapse seismic monitoring of CO2 storage have been used\nsuccessfully in tracking the evolution of CO2 plumes in the subsurface, these\nmethods lack principled approaches to characterize uncertainties related to the\nCO2 plumes' behavior. Inclusion of systematic assessment of uncertainties is\nessential for risk mitigation for the following reasons: (i) CO2 plume-induced\nchanges are small and seismic data is noisy; (ii) changes between regular and\nirregular (e.g., caused by leakage) flow patterns are small; and (iii) the\nreservoir properties that control the flow are strongly heterogeneous and\ntypically only available as distributions. To arrive at a formulation capable\nof inferring flow patterns for regular and irregular flow from well and seismic\ndata, the performance of conditional normalizing flow will be analyzed on a\nseries of carefully designed numerical experiments. While the inferences\npresented are preliminary in the context of an early CO2 leakage detection\nsystem, the results do indicate that inferences with conditional normalizing\nflows can produce high-fidelity estimates for CO2 plumes with or without\nleakage. We are also confident that the inferred uncertainty is reasonable\nbecause it correlates well with the observed errors. This uncertainty stems\nfrom noise in the seismic data and from the lack of precise knowledge of the\nreservoir's fluid flow properties.\n","authors":["Abhinav Prakash Gahlot","Huseyin Tuna Erdinc","Rafael Orozco","Ziyi Yin","Felix J. Herrmann"],"pdf_url":"https://arxiv.org/pdf/2311.00290v2.pdf","comment":"Accepted in NeurIPS 2023 Workshop - Tackling Climate Change with\n Machine Learning (Spotlight)"},{"id":"http://arxiv.org/abs/2311.16203v2","updated":"2023-11-29T01:53:46Z","published":"2023-11-27T08:52:10Z","title":"ChatTraffic: Text-to-Traffic Generation via Diffusion Model","summary":" Traffic prediction is one of the most significant foundations in Intelligent\nTransportation Systems (ITS). 
Traditional traffic prediction methods rely only\non historical traffic data to predict traffic trends and face two main\nchallenges. 1) insensitivity to unusual events. 2) poor performance in\nlong-term prediction. In this work, we explore how generative models combined\nwith text describing the traffic system can be applied for traffic generation\nand name the task Text-to-Traffic Generation (TTG). The key challenge of the\nTTG task is how to associate text with the spatial structure of the road\nnetwork and traffic data for generating traffic situations. To this end, we\npropose ChatTraffic, the first diffusion model for text-to-traffic generation.\nTo guarantee the consistency between synthetic and real data, we augment a\ndiffusion model with the Graph Convolutional Network (GCN) to extract spatial\ncorrelations of traffic data. In addition, we construct a large dataset\ncontaining text-traffic pairs for the TTG task. We benchmarked our model\nqualitatively and quantitatively on the released dataset. The experimental\nresults indicate that ChatTraffic can generate realistic traffic situations\nfrom the text. Our code and dataset are available at\nhttps://github.com/ChyaZhang/ChatTraffic.\n","authors":["Chengyang Zhang","Yong Zhang","Qitan Shao","Bo Li","Yisheng Lv","Xinglin Piao","Baocai Yin"],"pdf_url":"https://arxiv.org/pdf/2311.16203v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00489v3","updated":"2023-11-29T01:48:10Z","published":"2023-09-30T20:59:42Z","title":"Dynamic DAG Discovery for Interpretable Imitation Learning","summary":" Imitation learning, which learns agent policy by mimicking expert\ndemonstration, has shown promising results in many applications such as medical\ntreatment regimes and self-driving vehicles. However, it remains a difficult\ntask to interpret control policies learned by the agent. Difficulties mainly\ncome from two aspects: 1) agents in imitation learning are usually implemented\nas deep neural networks, which are black-box models and lack interpretability;\n2) the latent causal mechanism behind agents' decisions may vary along the\ntrajectory, rather than staying static throughout time steps. To increase\ntransparency and offer better interpretability of the neural agent, we propose\nto expose its captured knowledge in the form of a directed acyclic causal\ngraph, with nodes being action and state variables and edges denoting the\ncausal relations behind predictions. Furthermore, we design this causal\ndiscovery process to be state-dependent, enabling it to model the dynamics in\nlatent causal graphs. Concretely, we conduct causal discovery from the\nperspective of Granger causality and propose a self-explainable imitation\nlearning framework, {\\method}. The proposed framework is composed of three\nparts: a dynamic causal discovery module, a causality encoding module, and a\nprediction module, and is trained in an end-to-end manner. After the model is\nlearned, we can obtain causal relations among states and action variables\nbehind its decisions, exposing policies learned by it. 
Experimental results on\nboth synthetic and real-world datasets demonstrate the effectiveness of the\nproposed {\\method} in learning the dynamic causal graphs for understanding the\ndecision-making of imitation learning meanwhile maintaining high prediction\naccuracy.\n","authors":["Tianxiang Zhao","Wenchao Yu","Suhang Wang","Lu Wang","Xiang Zhang","Yuncong Chen","Yanchi Liu","Wei Cheng","Haifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2310.00489v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08812v4","updated":"2023-11-29T01:41:23Z","published":"2023-10-13T01:50:43Z","title":"A novel decomposed-ensemble time series forecasting framework: capturing\n underlying volatility information","summary":" Time series forecasting represents a significant and challenging task across\nvarious fields. Recently, methods based on mode decomposition have dominated\nthe forecasting of complex time series because of the advantages of capturing\nlocal characteristics and extracting intrinsic modes from data. Unfortunately,\nmost models fail to capture the implied volatilities that contain significant\ninformation. To enhance the prediction of contemporary diverse and complex time\nseries, we propose a novel time series forecasting paradigm that integrates\ndecomposition with the capability to capture the underlying fluctuation\ninformation of the series. In our methodology, we implement the Variational\nMode Decomposition algorithm to decompose the time series into K distinct\nsub-modes. Following this decomposition, we apply the Generalized\nAutoregressive Conditional Heteroskedasticity (GARCH) model to extract the\nvolatility information in these sub-modes. Subsequently, both the numerical\ndata and the volatility information for each sub-mode are harnessed to train a\nneural network. This network is adept at predicting the information of the\nsub-modes, and we aggregate the predictions of all sub-modes to generate the\nfinal output. By integrating econometric and artificial intelligence methods,\nand taking into account both the numerical and volatility information of the\ntime series, our proposed framework demonstrates superior performance in time\nseries forecasting, as evidenced by the significant decrease in MSE, RMSE, and\nMAPE in our comparative experimental results.\n","authors":["Zhengtao Gui","Haoyuan Li","Sijie Xu","Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2310.08812v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05866v2","updated":"2023-11-29T01:30:12Z","published":"2023-10-09T17:03:08Z","title":"Generative quantum machine learning via denoising diffusion\n probabilistic models","summary":" Deep generative models are key-enabling technology to computer vision, text\ngeneration and large language models. Denoising diffusion probabilistic models\n(DDPMs) have recently gained much attention due to their ability to generate\ndiverse and high-quality samples in many computer vision tasks, as well as to\nincorporate flexible model architectures and relatively simple training scheme.\nQuantum generative models, empowered by entanglement and superposition, have\nbrought new insight to learning classical and quantum data. Inspired by the\nclassical counterpart, we propose the quantum denoising diffusion probabilistic\nmodels (QuDDPM) to enable efficiently trainable generative learning of quantum\ndata. 
QuDDPM adopts sufficient layers of circuits to guarantee expressivity,\nwhile introduces multiple intermediate training tasks as interpolation between\nthe target distribution and noise to avoid barren plateau and guarantee\nefficient training. We provide bounds on the learning error and demonstrate\nQuDDPM's capability in learning correlated quantum noise model, quantum\nmany-body phases and topological structure of quantum data. The results provide\na paradigm for versatile and efficient quantum generative learning.\n","authors":["Bingzhi Zhang","Peng Xu","Xiaohui Chen","Quntao Zhuang"],"pdf_url":"https://arxiv.org/pdf/2310.05866v2.pdf","comment":"5+7 pages, 11 figures"},{"id":"http://arxiv.org/abs/2311.17303v1","updated":"2023-11-29T01:25:00Z","published":"2023-11-29T01:25:00Z","title":"Enhancing the Performance of Neural Networks Through Causal Discovery\n and Integration of Domain Knowledge","summary":" In this paper, we develop a generic methodology to encode hierarchical\ncausality structure among observed variables into a neural network in order to\nimprove its predictive performance. The proposed methodology, called\ncausality-informed neural network (CINN), leverages three coherent steps to\nsystematically map the structural causal knowledge into the layer-to-layer\ndesign of neural network while strictly preserving the orientation of every\ncausal relationship. In the first step, CINN discovers causal relationships\nfrom observational data via directed acyclic graph (DAG) learning, where causal\ndiscovery is recast as a continuous optimization problem to avoid the\ncombinatorial nature. In the second step, the discovered hierarchical causality\nstructure among observed variables is systematically encoded into neural\nnetwork through a dedicated architecture and customized loss function. By\ncategorizing variables in the causal DAG as root, intermediate, and leaf nodes,\nthe hierarchical causal DAG is translated into CINN with a one-to-one\ncorrespondence between nodes in the causal DAG and units in the CINN while\nmaintaining the relative order among these nodes. Regarding the loss function,\nboth intermediate and leaf nodes in the DAG graph are treated as target outputs\nduring CINN training so as to drive co-learning of causal relationships among\ndifferent types of nodes. As multiple loss components emerge in CINN, we\nleverage the projection of conflicting gradients to mitigate gradient\ninterference among the multiple learning tasks. Computational experiments\nacross a broad spectrum of UCI data sets demonstrate substantial advantages of\nCINN in predictive performance over other state-of-the-art methods. In\naddition, an ablation study underscores the value of integrating structural and\nquantitative causal knowledge in enhancing the neural network's predictive\nperformance incrementally.\n","authors":["Xiaoge Zhang","Xiao-Lin Wang","Fenglei Fan","Yiu-Ming Cheung","Indranil Bose"],"pdf_url":"https://arxiv.org/pdf/2311.17303v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17301v1","updated":"2023-11-29T01:19:02Z","published":"2023-11-29T01:19:02Z","title":"Language Models: A Guide for the Perplexed","summary":" Given the growing importance of AI literacy, we decided to write this\ntutorial to help narrow the gap between the discourse among those who study\nlanguage models -- the core technology underlying ChatGPT and similar products\n-- and those who are intrigued and want to learn more about them. 
In short, we\nbelieve the perspective of researchers and educators can add some clarity to\nthe public's understanding of the technologies beyond what's currently\navailable, which tends to be either extremely technical or promotional material\ngenerated about products by their purveyors.\n Our approach teases apart the concept of a language model from products built\non them, from the behaviors attributed to or desired from those products, and\nfrom claims about similarity to human cognition. As a starting point, we (1)\noffer a scientific viewpoint that focuses on questions amenable to study\nthrough experimentation; (2) situate language models as they are today in the\ncontext of the research that led to their development; and (3) describe the\nboundaries of what is known about the models at this writing.\n","authors":["Sofia Serrano","Zander Brumbaugh","Noah A. Smith"],"pdf_url":"https://arxiv.org/pdf/2311.17301v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06753v2","updated":"2023-11-29T01:17:18Z","published":"2023-03-12T21:01:54Z","title":"Modular Quantization-Aware Training: Increasing Accuracy by Decreasing\n Precision in 6D Object Pose Estimation","summary":" Edge applications, such as collaborative robotics and spacecraft rendezvous,\ndemand efficient 6D object pose estimation on resource-constrained embedded\nplatforms. Existing 6D pose estimation networks are often too large for such\ndeployments, necessitating compression while maintaining reliable performance.\nTo address this challenge, we introduce Modular Quantization-Aware Training\n(MQAT), an adaptive and mixed-precision quantization-aware training strategy\nthat exploits the modular structure of modern 6D pose estimation architectures.\nMQAT guides a systematic gradated modular quantization sequence and determines\nmodule-specific bit precisions, leading to quantized models that outperform\nthose produced by state-of-the-art uniform and mixed-precision quantization\ntechniques. Our experiments showcase the generality of MQAT across datasets,\narchitectures, and quantization algorithms. Remarkably, MQAT-trained quantized\nmodels achieve a significant accuracy boost (>7%) over the baseline\nfull-precision network while reducing model size by a factor of 4x or more.\n","authors":["Saqib Javed","Chengkun Li","Andrew Price","Yinlin Hu","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2303.06753v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17299v1","updated":"2023-11-29T01:10:39Z","published":"2023-11-29T01:10:39Z","title":"Federated Fine-Tuning of Foundation Models via Probabilistic Masking","summary":" Foundation Models (FMs) have revolutionized machine learning with their\nadaptability and high performance across tasks; yet, their integration into\nFederated Learning (FL) is challenging due to substantial communication\noverhead from their extensive parameterization. Current communication-efficient\nFL strategies, such as gradient compression, reduce bitrates to around $1$\nbit-per-parameter (bpp). However, these approaches fail to harness the\ncharacteristics of FMs, with their large number of parameters still posing a\nchallenge to communication efficiency, even at these bitrate regimes. In this\nwork, we present DeltaMask, a novel method that efficiently fine-tunes FMs in\nFL at an ultra-low bitrate, well below 1 bpp. 
DeltaMask employs stochastic\nmasking to detect highly effective subnetworks within FMs and leverage\nstochasticity and sparsity in client masks to compress updates into a compact\ngrayscale image using probabilistic filters, deviating from traditional weight\ntraining approaches. Our comprehensive evaluations across various datasets and\narchitectures demonstrate DeltaMask efficiently achieves bitrates as low as\n0.09 bpp, enhancing communication efficiency while maintaining FMs performance,\nas measured on 8 datasets and 5 pre-trained models of various network\narchitectures.\n","authors":["Vasileios Tsouvalas","Yuki Asano","Aaqib Saeed"],"pdf_url":"https://arxiv.org/pdf/2311.17299v1.pdf","comment":"19 pages, 9 figures"},{"id":"http://arxiv.org/abs/2310.08235v2","updated":"2023-11-29T01:07:39Z","published":"2023-10-12T11:31:01Z","title":"GROOT: Learning to Follow Instructions by Watching Gameplay Videos","summary":" We study the problem of building a controller that can follow open-ended\ninstructions in open-world environments. We propose to follow reference videos\nas instructions, which offer expressive goal specifications while eliminating\nthe need for expensive text-gameplay annotations. A new learning framework is\nderived to allow learning such instruction-following controllers from gameplay\nvideos while producing a video instruction encoder that induces a structured\ngoal space. We implement our agent GROOT in a simple yet effective\nencoder-decoder architecture based on causal transformers. We evaluate GROOT\nagainst open-world counterparts and human players on a proposed Minecraft\nSkillForge benchmark. The Elo ratings clearly show that GROOT is closing the\nhuman-machine gap as well as exhibiting a 70% winning rate over the best\ngeneralist agent baseline. Qualitative analysis of the induced goal space\nfurther demonstrates some interesting emergent properties, including the goal\ncomposition and complex gameplay behavior synthesis. The project page is\navailable at https://craftjarvis-groot.github.io.\n","authors":["Shaofei Cai","Bowei Zhang","Zihao Wang","Xiaojian Ma","Anji Liu","Yitao Liang"],"pdf_url":"https://arxiv.org/pdf/2310.08235v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17287v1","updated":"2023-11-29T00:14:30Z","published":"2023-11-29T00:14:30Z","title":"Utilizing Model Residuals to Identify Rental Properties of Interest: The\n Price Anomaly Score (PAS) and Its Application to Real-time Data in Manhattan","summary":" Understanding whether a property is priced fairly hinders buyers and sellers\nsince they usually do not have an objective viewpoint of the price distribution\nfor the overall market of their interest. Drawing from data collected of all\npossible available properties for rent in Manhattan as of September 2023, this\npaper aims to strengthen our understanding of model residuals; specifically on\nmachine learning models which generalize for a majority of the distribution of\na well-proportioned dataset. Most models generally perceive deviations from\npredicted values as mere inaccuracies, however this paper proposes a different\nvantage point: when generalizing to at least 75\\% of the data-set, the\nremaining deviations reveal significant insights. To harness these insights, we\nintroduce the Price Anomaly Score (PAS), a metric capable of capturing\nboundaries between irregularly predicted prices. 
By combining relative pricing\ndiscrepancies with statistical significance, the Price Anomaly Score (PAS)\noffers a multifaceted view of rental valuations. This metric allows experts to\nidentify overpriced or underpriced properties within a dataset by aggregating\nPAS values, then fine-tuning upper and lower boundaries to any threshold to set\nindicators of choice.\n","authors":["Youssef Sultan","Jackson C. Rafter","Huyen T. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2311.17287v1.pdf","comment":"8 pages, 8 figures, dataset is available with DOI"},{"id":"http://arxiv.org/abs/2311.11772v3","updated":"2023-11-29T00:06:13Z","published":"2023-11-20T13:58:26Z","title":"A Good Feature Extractor Is All You Need for Weakly Supervised Learning\n in Histopathology","summary":" Deep learning is revolutionising pathology, offering novel opportunities in\ndisease prognosis and personalised treatment. Historically, stain normalisation\nhas been a crucial preprocessing step in computational pathology pipelines, and\npersists into the deep learning era. Yet, with the emergence of feature\nextractors trained using self-supervised learning (SSL) on diverse pathology\ndatasets, we call this practice into question. In an empirical evaluation of\npublicly available feature extractors, we find that omitting stain\nnormalisation and image augmentations does not compromise downstream\nperformance, while incurring substantial savings in memory and compute.\nFurther, we show that the top-performing feature extractors are remarkably\nrobust to variations in stain and augmentations like rotation in their latent\nspace. Contrary to previous patch-level benchmarking studies, our approach\nemphasises clinical relevance by focusing on slide-level prediction tasks in a\nweakly supervised setting with external validation cohorts. This work\nrepresents the most comprehensive robustness evaluation of public pathology SSL\nfeature extractors to date, involving more than 6,000 training runs across nine\ntasks, five datasets, three downstream architectures, and various preprocessing\nsetups. Our findings stand to streamline digital pathology workflows by\nminimising preprocessing needs and informing the selection of feature\nextractors.\n","authors":["Georg Wölflein","Dyke Ferber","Asier Rabasco Meneghetti","Omar S. M. El Nahhas","Daniel Truhn","Zunamys I. Carrero","David J. Harrison","Ognjen Arandjelović","Jakob N. Kather"],"pdf_url":"https://arxiv.org/pdf/2311.11772v3.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2311.17754v1","updated":"2023-11-29T15:56:58Z","published":"2023-11-29T15:56:58Z","title":"Cinematic Behavior Transfer via NeRF-based Differentiable Filming","summary":" In the evolving landscape of digital media and video production, the precise\nmanipulation and reproduction of visual elements like camera movements and\ncharacter actions are highly desired. Existing SLAM methods face limitations in\ndynamic scenes and human pose estimation often focuses on 2D projections,\nneglecting 3D statuses. To address these issues, we first introduce a reverse\nfilming behavior estimation technique. It optimizes camera trajectories by\nleveraging NeRF as a differentiable renderer and refining SMPL tracks. We then\nintroduce a cinematic transfer pipeline that is able to transfer various shot\ntypes to a new 2D video or a 3D virtual environment. 
The incorporation of 3D\nengine workflow enables superior rendering and control abilities, which also\nachieves a higher rating in the user study.\n","authors":["Xuekun Jiang","Anyi Rao","Jingbo Wang","Dahua Lin","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2311.17754v1.pdf","comment":"Project Page:\n https://virtualfilmstudio.github.io/projects/cinetransfer"},{"id":"http://arxiv.org/abs/2311.17752v1","updated":"2023-11-29T15:56:31Z","published":"2023-11-29T15:56:31Z","title":"BAND-2k: Banding Artifact Noticeable Database for Banding Detection and\n Quality Assessment","summary":" Banding, also known as staircase-like contours, frequently occurs in flat\nareas of images/videos processed by the compression or quantization algorithms.\nAs undesirable artifacts, banding destroys the original image structure, thus\ndegrading users' quality of experience (QoE). In this paper, we systematically\ninvestigate the banding image quality assessment (IQA) problem, aiming to\ndetect the image banding artifacts and evaluate their perceptual visual\nquality. Considering that the existing image banding databases only contain\nlimited content sources and banding generation methods, and lack perceptual\nquality labels (i.e. mean opinion scores), we first build the largest banding\nIQA database so far, named Banding Artifact Noticeable Database (BAND-2k),\nwhich consists of 2,000 banding images generated by 15 compression and\nquantization schemes. A total of 23 workers participated in the subjective IQA\nexperiment, yielding over 214,000 patch-level banding class labels and 44,371\nreliable image-level quality ratings. Subsequently, we develop an effective\nno-reference (NR) banding evaluator for banding detection and quality\nassessment by leveraging frequency characteristics of banding artifacts. A dual\nconvolutional neural network is employed to concurrently learn the feature\nrepresentation from the high-frequency and low-frequency maps, thereby\nenhancing the ability to discern banding artifacts. The quality score of a\nbanding image is generated by pooling the banding detection maps masked by the\nspatial frequency filters. Experiments demonstrate that our banding evaluator\nachieves a remarkably high accuracy in banding detection and also exhibits high\nSRCC and PLCC results with the perceptual quality labels. These findings unveil\nthe strong correlations between the intensity of banding artifacts and the\nperceptual visual quality, thus validating the necessity of banding quality\nassessment.\n","authors":["Zijian Chen","Wei Sun","Jun Jia","Fangfang Lu","Zicheng Zhang","Jing Liu","Ru Huang","Xiongkuo Min","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2311.17752v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17655v1","updated":"2023-11-29T14:18:04Z","published":"2023-11-29T14:18:04Z","title":"Vulnerability of Automatic Identity Recognition to Audio-Visual\n Deepfakes","summary":" The task of deepfakes detection is far from being solved by speech or vision\nresearchers. Several publicly available databases of fake synthetic video and\nspeech were built to aid the development of detection methods. However,\nexisting databases typically focus on visual or voice modalities and provide no\nproof that their deepfakes can in fact impersonate any real person. In this\npaper, we present the first realistic audio-visual database of deepfakes\nSWAN-DF, where lips and speech are well synchronized and video have high visual\nand audio qualities. 
We took the publicly available SWAN dataset of real videos\nwith different identities to create audio-visual deepfakes using several models\nfrom DeepFaceLab and blending techniques for face swapping and HiFiVC, DiffVC,\nYourTTS, and FreeVC models for voice conversion. From the publicly available\nspeech dataset LibriTTS, we also created a separate database of audio-only\ndeepfakes, LibriTTS-DF, using several of the latest text-to-speech methods: YourTTS,\nAdaspeech, and TorToiSe. We demonstrate the vulnerability of a state-of-the-art\nspeaker recognition system, such as the ECAPA-TDNN-based model from SpeechBrain, to\nthe synthetic voices. Similarly, we tested a face recognition system based on the\nMobileFaceNet architecture against several variants of our visual deepfakes. The\nvulnerability assessment shows that by tuning the existing pretrained deepfake\nmodels to specific identities, one can successfully spoof the face and speaker\nrecognition systems more than 90% of the time and achieve a very realistic\nlooking and sounding fake video of a given person.\n","authors":["Pavel Korshunov","Haolin Chen","Philip N. Garner","Sebastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2311.17655v1.pdf","comment":"10 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2210.04671v3","updated":"2023-11-29T13:20:22Z","published":"2022-10-10T13:20:51Z","title":"TCDM: Transformational Complexity Based Distortion Metric for Perceptual\n Point Cloud Quality Assessment","summary":" The goal of objective point cloud quality assessment (PCQA) research is to\ndevelop quantitative metrics that measure point cloud quality in a perceptually\nconsistent manner. Merging the research of cognitive science and intuition of\nthe human visual system (HVS), in this paper, we evaluate the point cloud\nquality by measuring the complexity of transforming the distorted point cloud\nback to its reference, which in practice can be approximated by the code length\nof one point cloud when the other is given. For this purpose, we first make\nspace segmentation for the reference and distorted point clouds based on a 3D\nVoronoi diagram to obtain a series of local patch pairs. Next, inspired by the\npredictive coding theory, we utilize a space-aware vector autoregressive\n(SA-VAR) model to encode the geometry and color channels of each reference\npatch with and without the distorted patch, respectively. Assuming that the\nresidual errors follow the multi-variate Gaussian distributions, the\nself-complexity of the reference and transformational complexity between the\nreference and distorted samples are computed using covariance matrices.\nAdditionally, the prediction terms generated by SA-VAR are introduced as one\nauxiliary feature to promote the final quality prediction. The effectiveness of\nthe proposed transformational complexity based distortion metric (TCDM) is\nevaluated through extensive experiments conducted on five public point cloud\nquality assessment databases. The results demonstrate that TCDM achieves\nstate-of-the-art (SOTA) performance, and further analysis confirms its\nrobustness in various scenarios. 
The code is publicly available at\nhttps://github.com/zyj1318053/TCDM.\n","authors":["Yujie Zhang","Qi Yang","Yifei Zhou","Xiaozhong Xu","Le Yang","Yiling Xu"],"pdf_url":"https://arxiv.org/pdf/2210.04671v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17350v1","updated":"2023-11-29T04:15:57Z","published":"2023-11-29T04:15:57Z","title":"Implicit-explicit Integrated Representations for Multi-view Video\n Compression","summary":" With the increasing consumption of 3D displays and virtual reality,\nmulti-view video has become a promising format. However, its high resolution\nand multi-camera shooting result in a substantial increase in data volume,\nmaking storage and transmission a challenging task. To tackle these\ndifficulties, we propose an implicit-explicit integrated representation for\nmulti-view video compression. Specifically, we first use the explicit\nrepresentation-based 2D video codec to encode one of the source views.\nSubsequently, we propose employing the implicit neural representation\n(INR)-based codec to encode the remaining views. The implicit codec takes the\ntime and view index of multi-view video as coordinate inputs and generates the\ncorresponding implicit reconstruction frames. To enhance the compressibility, we\nintroduce a multi-level feature grid embedding and a fully convolutional\narchitecture into the implicit codec. These components facilitate\ncoordinate-feature and feature-RGB mapping, respectively. To further enhance\nthe reconstruction quality from the INR codec, we leverage the high-quality\nreconstructed frames from the explicit codec to achieve inter-view\ncompensation. Finally, the compensated results are fused with the implicit\nreconstructions from the INR to obtain the final reconstructed frames. Our\nproposed framework combines the strengths of both implicit neural\nrepresentation and explicit 2D codec. Extensive experiments conducted on public\ndatasets demonstrate that the proposed framework can achieve comparable or even\nsuperior performance to the latest multi-view video compression standard MIV\nand other INR-based schemes in terms of view compression and scene modeling.\n","authors":["Chen Zhu","Guo Lu","Bing He","Rong Xie","Li Song"],"pdf_url":"https://arxiv.org/pdf/2311.17350v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17335v1","updated":"2023-11-29T03:24:30Z","published":"2023-11-29T03:24:30Z","title":"eMotions: A Large-Scale Dataset for Emotion Recognition in Short Videos","summary":" Nowadays, short videos (SVs) are essential to information acquisition and\nsharing in our lives. The prevailing use of SVs to spread emotions leads to the\nnecessity of emotion recognition in SVs. Considering the lack of SVs emotion\ndata, we introduce a large-scale dataset named eMotions, comprising 27,996\nvideos. Meanwhile, we alleviate the impact of subjectivities on labeling\nquality by emphasizing better personnel allocations and multi-stage\nannotations. In addition, we provide the category-balanced and test-oriented\nvariants through targeted data sampling. Some commonly used videos (e.g.,\nfacial expressions and postures) have been well studied. However, it is still\nchallenging to understand the emotions in SVs. The enhanced content\ndiversity brings more distinct semantic gaps and difficulties in learning\nemotion-related features, and there are information gaps caused by\nemotion incompleteness under the prevalent audio-visual co-expressions. 
To\ntackle these problems, we present an end-to-end baseline method AV-CPNet that\nemploys the video transformer to better learn semantically relevant\nrepresentations. We further design the two-stage cross-modal fusion module to\ncomplementarily model the correlations of audio-visual features. The EP-CE\nLoss, incorporating three emotion polarities, is then applied to guide model\noptimization. Extensive experimental results on nine datasets verify the\neffectiveness of AV-CPNet. Datasets and code will be open on\nhttps://github.com/XuecWu/eMotions.\n","authors":["Xuecheng Wu","Heli Sun","Junxiao Xue","Ruofan Zhai","Xiangyan Kong","Jiayu Nie","Liang He"],"pdf_url":"https://arxiv.org/pdf/2311.17335v1.pdf","comment":null}]},"2023-11-30T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2311.17898v2","updated":"2023-11-30T18:59:01Z","published":"2023-11-29T18:51:46Z","title":"Knowledge Pursuit Prompting for Zero-Shot Multimodal Synthesis","summary":" Hallucinations and unfaithful synthesis due to inaccurate prompts with\ninsufficient semantic details are widely observed in multimodal generative\nmodels. A prevalent strategy to align multiple modalities is to fine-tune the\ngenerator with a large number of annotated text-image pairs. However, such a\nprocedure is labor-consuming and resource-draining. The key question we ask is:\ncan we enhance the quality and faithfulness of text-driven generative models\nbeyond extensive text-image pair annotations? To address this question, we\npropose Knowledge Pursuit Prompting (KPP), a zero-shot framework that\niteratively incorporates external knowledge to help generators produce reliable\nvisual content. Instead of training generators to handle generic prompts, KPP\nemploys a recursive knowledge query process to gather informative external\nfacts from the knowledge base, instructs a language model to compress the\nacquired knowledge for prompt refinement, and utilizes text-driven generators\nfor visual synthesis. The entire process is zero-shot, without accessing the\narchitectures and parameters of generative models. We evaluate the framework\nacross multiple text-driven generative tasks (image, 3D rendering, and video)\non datasets of different domains. We further demonstrate the extensibility and\nadaptability of KPP through varying foundation model bases and instructions.\nOur results show that KPP is capable of generating faithful and semantically\nrich content across diverse visual domains, offering a promising solution to\nimprove multimodal generative models.\n","authors":["Jinqi Luo","Kwan Ho Ryan Chan","Dimitris Dimos","René Vidal"],"pdf_url":"https://arxiv.org/pdf/2311.17898v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18812v1","updated":"2023-11-30T18:53:13Z","published":"2023-11-30T18:53:13Z","title":"What Do Llamas Really Think? Revealing Preference Biases in Language\n Model Representations","summary":" Do large language models (LLMs) exhibit sociodemographic biases, even when\nthey decline to respond? To bypass their refusal to \"speak,\" we study this\nresearch question by probing contextualized embeddings and exploring whether\nthis bias is encoded in its latent representations. We propose a logistic\nBradley-Terry probe which predicts word pair preferences of LLMs from the\nwords' hidden vectors. 
We first validate our probe on three pair preference\ntasks and thirteen LLMs, where we outperform the word embedding association\ntest (WEAT), a standard approach in testing for implicit association, by a\nrelative 27% in error rate. We also find that word pair preferences are best\nrepresented in the middle layers. Next, we transfer probes trained on harmless\ntasks (e.g., pick the larger number) to controversial ones (compare\nethnicities) to examine biases in nationality, politics, religion, and gender.\nWe observe substantial bias for all target classes: for instance, the Mistral\nmodel implicitly prefers Europe to Africa, Christianity to Judaism, and\nleft-wing to right-wing politics, despite declining to answer. This suggests\nthat instruction fine-tuning does not necessarily debias contextualized\nembeddings. Our codebase is at https://github.com/castorini/biasprobe.\n","authors":["Raphael Tang","Xinyu Zhang","Jimmy Lin","Ferhan Ture"],"pdf_url":"https://arxiv.org/pdf/2311.18812v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.18805v1","updated":"2023-11-30T18:51:38Z","published":"2023-11-30T18:51:38Z","title":"Unnatural Error Correction: GPT-4 Can Almost Perfectly Handle Unnatural\n Scrambled Text","summary":" While Large Language Models (LLMs) have achieved remarkable performance in\nmany tasks, much about their inner workings remains unclear. In this study, we\npresent novel experimental insights into the resilience of LLMs, particularly\nGPT-4, when subjected to extensive character-level permutations. To investigate\nthis, we first propose the Scrambled Bench, a suite designed to measure the\ncapacity of LLMs to handle scrambled input, in terms of both recovering\nscrambled sentences and answering questions given scrambled context. The\nexperimental results indicate that most powerful LLMs demonstrate the\ncapability akin to typoglycemia, a phenomenon where humans can understand the\nmeaning of words even when the letters within those words are scrambled, as\nlong as the first and last letters remain in place. More surprisingly, we found\nthat only GPT-4 nearly flawlessly processes inputs with unnatural errors, even\nunder the extreme condition, a task that poses significant challenges for other\nLLMs and often even for humans. Specifically, GPT-4 can almost perfectly\nreconstruct the original sentences from scrambled ones, decreasing the edit\ndistance by 95%, even when all letters within each word are entirely scrambled.\nIt is counter-intuitive that LLMs can exhibit such resilience despite severe\ndisruption to input tokenization caused by scrambled text.\n","authors":["Qi Cao","Takeshi Kojima","Yutaka Matsuo","Yusuke Iwasawa"],"pdf_url":"https://arxiv.org/pdf/2311.18805v1.pdf","comment":"EMNLP 2023 (with an additional analysis section in appendix)"},{"id":"http://arxiv.org/abs/2311.18803v1","updated":"2023-11-30T18:49:43Z","published":"2023-11-30T18:49:43Z","title":"BIOCLIP: A Vision Foundation Model for the Tree of Life","summary":" Images of the natural world, collected by a variety of cameras, from drones\nto individual phones, are increasingly abundant sources of biological\ninformation. There is an explosion of computational methods and tools,\nparticularly computer vision, for extracting biologically relevant information\nfrom images for science and conservation. Yet most of these are bespoke\napproaches designed for a specific task and are not easily adaptable or\nextendable to new questions, contexts, and datasets. 
A vision model for general\norganismal biology questions on images is of timely need. To approach this, we\ncurate and release TreeOfLife-10M, the largest and most diverse ML-ready\ndataset of biology images. We then develop BioCLIP, a foundation model for the\ntree of life, leveraging the unique properties of biology captured by\nTreeOfLife-10M, namely the abundance and variety of images of plants, animals,\nand fungi, together with the availability of rich structured biological\nknowledge. We rigorously benchmark our approach on diverse fine-grained biology\nclassification tasks, and find that BioCLIP consistently and substantially\noutperforms existing baselines (by 17% to 20% absolute). Intrinsic evaluation\nreveals that BioCLIP has learned a hierarchical representation conforming to\nthe tree of life, shedding light on its strong generalizability. Our code,\nmodels and data will be made available at\nhttps://github.com/Imageomics/bioclip.\n","authors":["Samuel Stevens","Jiaman Wu","Matthew J Thompson","Elizabeth G Campolongo","Chan Hee Song","David Edward Carlyn","Li Dong","Wasila M Dahdul","Charles Stewart","Tanya Berger-Wolf","Wei-Lun Chao","Yu Su"],"pdf_url":"https://arxiv.org/pdf/2311.18803v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2311.14743v3","updated":"2023-11-30T18:48:24Z","published":"2023-11-21T18:41:26Z","title":"A Baseline Analysis of Reward Models' Ability To Accurately Analyze\n Foundation Models Under Distribution Shift","summary":" Foundation models, specifically Large Language Models (LLM's), have lately\ngained wide-spread attention and adoption. Reinforcement Learning with Human\nFeedback (RLHF) involves training a reward model to capture desired behaviors,\nwhich is then used to align an LLM. These reward models are additionally used\nat inference-time to estimate how well LLM responses adhere to those desired\nbehaviors. However, there is little work measuring how robust these reward\nmodels are to distribution shifts. In this work, we evaluate how reward model\nperformance - measured via accuracy and calibration (i.e. alignment between\naccuracy and confidence) - is affected by distribution shift. We show novel\ncalibration patterns and accuracy drops due to OOD prompts and responses, and\nthat the reward model is more sensitive to shifts in responses than prompts.\nAdditionally, we adapt an OOD detection technique commonly used in\nclassification to the reward model setting in order to detect these\ndistribution shifts in prompts and responses.\n","authors":["Ben Pikus","Will LeVine","Tony Chen","Sean Hendryx"],"pdf_url":"https://arxiv.org/pdf/2311.14743v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18799v1","updated":"2023-11-30T18:43:51Z","published":"2023-11-30T18:43:51Z","title":"X-InstructBLIP: A Framework for aligning X-Modal instruction-aware\n representations to LLMs and Emergent Cross-modal Reasoning","summary":" Vision-language pre-training and instruction tuning have demonstrated\ngeneral-purpose capabilities in 2D visual reasoning tasks by aligning visual\nencoders with state-of-the-art large language models (LLMs). In this paper, we\nintroduce a simple, yet effective, cross-modality framework built atop frozen\nLLMs that allows the integration of various modalities without extensive\nmodality-specific customization. To facilitate instruction-modality\nfine-tuning, we collect high-quality instruction tuning data in an automatic\nand scalable manner, composed of 24K QA samples for audio and 250K QA samples\nfor 3D. 
Leveraging instruction-aware representations, our model performs\ncomparably with leading-edge counterparts without the need of extensive\nmodality-specific pre-training or customization. Furthermore, our approach\ndemonstrates cross-modal reasoning abilities across two or more input\nmodalities, despite each modality projection being trained individually. To\nstudy the model's cross-modal abilities, we contribute a novel Discriminative\nCross-modal Reasoning (DisCRn) evaluation task, comprising 9K audio-video QA\nsamples and 28K image-3D QA samples that require the model to reason\ndiscriminatively across disparate input modalities.\n","authors":["Artemis Panagopoulou","Le Xue","Ning Yu","Junnan Li","Dongxu Li","Shafiq Joty","Ran Xu","Silvio Savarese","Caiming Xiong","Juan Carlos Niebles"],"pdf_url":"https://arxiv.org/pdf/2311.18799v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18778v1","updated":"2023-11-30T18:23:38Z","published":"2023-11-30T18:23:38Z","title":"Mavericks at BLP-2023 Task 1: Ensemble-based Approach Using Language\n Models for Violence Inciting Text Detection","summary":" This paper presents our work for the Violence Inciting Text Detection shared\ntask in the First Workshop on Bangla Language Processing. Social media has\naccelerated the propagation of hate and violence-inciting speech in society. It\nis essential to develop efficient mechanisms to detect and curb the propagation\nof such texts. The problem of detecting violence-inciting texts is further\nexacerbated in low-resource settings due to sparse research and less data. The\ndata provided in the shared task consists of texts in the Bangla language,\nwhere each example is classified into one of the three categories defined based\non the types of violence-inciting texts. We try and evaluate several BERT-based\nmodels, and then use an ensemble of the models as our final submission. Our\nsubmission is ranked 10th in the final leaderboard of the shared task with a\nmacro F1 score of 0.737.\n","authors":["Saurabh Page","Sudeep Mangalvedhekar","Kshitij Deshpande","Tanmay Chavan","Sheetal Sonawane"],"pdf_url":"https://arxiv.org/pdf/2311.18778v1.pdf","comment":"6 pages, 1 figure, accepted at the BLP Workshop, EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.18775v1","updated":"2023-11-30T18:21:25Z","published":"2023-11-30T18:21:25Z","title":"CoDi-2: In-Context, Interleaved, and Interactive Any-to-Any Generation","summary":" We present CoDi-2, a versatile and interactive Multimodal Large Language\nModel (MLLM) that can follow complex multimodal interleaved instructions,\nconduct in-context learning (ICL), reason, chat, edit, etc., in an any-to-any\ninput-output modality paradigm. By aligning modalities with language for both\nencoding and generation, CoDi-2 empowers Large Language Models (LLMs) to not\nonly understand complex modality-interleaved instructions and in-context\nexamples, but also autoregressively generate grounded and coherent multimodal\noutputs in the continuous feature space. To train CoDi-2, we build a\nlarge-scale generation dataset encompassing in-context multimodal instructions\nacross text, vision, and audio. CoDi-2 demonstrates a wide range of zero-shot\ncapabilities for multimodal generation, such as in-context learning, reasoning,\nand compositionality of any-to-any modality generation through multi-round\ninteractive conversation. CoDi-2 surpasses previous domain-specific models on\ntasks such as subject-driven image generation, vision transformation, and audio\nediting. 
CoDi-2 signifies a substantial breakthrough in developing a\ncomprehensive multimodal foundation model adept at interpreting in-context\nlanguage-vision-audio interleaved instructions and producing multimodal\noutputs.\n","authors":["Zineng Tang","Ziyi Yang","Mahmoud Khademi","Yang Liu","Chenguang Zhu","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2311.18775v1.pdf","comment":"Project Page: https://codi-2.github.io/"},{"id":"http://arxiv.org/abs/2310.16111v2","updated":"2023-11-30T18:13:01Z","published":"2023-10-24T18:25:13Z","title":"Locally Differentially Private Document Generation Using Zero Shot\n Prompting","summary":" Numerous studies have highlighted the privacy risks associated with\npretrained large language models. In contrast, our research offers a unique\nperspective by demonstrating that pretrained large language models can\neffectively contribute to privacy preservation. We propose a locally\ndifferentially private mechanism called DP-Prompt, which leverages the power of\npretrained large language models and zero-shot prompting to counter author\nde-anonymization attacks while minimizing the impact on downstream utility.\nWhen DP-Prompt is used with a powerful language model like ChatGPT (gpt-3.5),\nwe observe a notable reduction in the success rate of de-anonymization attacks,\nshowing that it surpasses existing approaches by a considerable margin despite\nits simpler design. For instance, in the case of the IMDB dataset, DP-Prompt\n(with ChatGPT) perfectly recovers the clean sentiment F1 score while achieving\na 46\% reduction in author identification F1 score against static attackers and\na 26\% reduction against adaptive attackers. We conduct extensive experiments\nacross six open-source large language models, ranging up to 7 billion\nparameters, to analyze various effects of the privacy-utility tradeoff.\n","authors":["Saiteja Utpala","Sara Hooker","Pin Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2310.16111v2.pdf","comment":"Accepted at EMNLP 2023 (Findings)"},{"id":"http://arxiv.org/abs/2311.18765v1","updated":"2023-11-30T18:05:52Z","published":"2023-11-30T18:05:52Z","title":"MLLMs-Augmented Visual-Language Representation Learning","summary":" Visual-language pre-training (VLP) has achieved remarkable success in\nmulti-modal tasks, largely attributed to the availability of large-scale\nimage-text datasets. In this work, we demonstrate that multi-modal large\nlanguage models (MLLMs) can enhance visual-language representation learning by\nimproving data quality. Our approach is simple, utilizing MLLMs to extend\nmultiple captions for each image. To prevent the bias introduced by MLLMs'\nhallucinations and intrinsic caption styles, we propose \"text shearing\" to\nkeep the lengths of extended captions identical to the originals. 
In image-text\nretrieval, our method consistently obtains 5.6 ~ 35.0% and 16.8 ~ 46.1%\nimprovement on R@1 under the fine-tuning and zero-shot settings, respectively.\nNotably, our zero-shot results are comparable to fine-tuning on target\ndatasets, which encourages more exploration on the versatile use of MLLMs.\n","authors":["Yanqing Liu","Kai Wang","Wenqi Shao","Ping Luo","Yu Qiao","Mike Zheng Shou","Kaipeng Zhang","Yang You"],"pdf_url":"https://arxiv.org/pdf/2311.18765v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18761v1","updated":"2023-11-30T18:03:58Z","published":"2023-11-30T18:03:58Z","title":"Can training neural language models on a curriculum with developmentally\n plausible data improve alignment with human reading behavior?","summary":" The use of neural language models to model human behavior has met with mixed\nsuccess. While some work has found that the surprisal estimates from these\nmodels can be used to predict a wide range of human neural and behavioral\nresponses, other work studying more complex syntactic phenomena has found that\nthese surprisal estimates generate incorrect behavioral predictions. This paper\nexplores the extent to which the misalignment between empirical and\nmodel-predicted behavior can be minimized by training models on more\ndevelopmentally plausible data, such as in the BabyLM Challenge. We trained\nteacher language models on the BabyLM \"strict-small\" dataset and used sentence\nlevel surprisal estimates from these teacher models to create a curriculum. We\nfound tentative evidence that our curriculum made it easier for models to\nacquire linguistic knowledge from the training data: on the subset of tasks in\nthe BabyLM challenge suite evaluating models' grammatical knowledge of English,\nmodels first trained on the BabyLM data curriculum and then on a few randomly\nordered training epochs performed slightly better than models trained on\nrandomly ordered epochs alone. This improved linguistic knowledge acquisition\ndid not result in better alignment with human reading behavior, however: models\ntrained on the BabyLM dataset (with or without a curriculum) generated\npredictions that were as misaligned with human behavior as models trained on\nlarger less curated datasets. This suggests that training on developmentally\nplausible datasets alone is likely insufficient to generate language models\ncapable of accurately predicting human language processing.\n","authors":["Aryaman Chobey","Oliver Smith","Anzi Wang","Grusha Prasad"],"pdf_url":"https://arxiv.org/pdf/2311.18761v1.pdf","comment":"To appear in the proceedings of BabyLM shared task CoNLL 2023"},{"id":"http://arxiv.org/abs/2311.18760v1","updated":"2023-11-30T18:02:44Z","published":"2023-11-30T18:02:44Z","title":"TaskBench: Benchmarking Large Language Models for Task Automation","summary":" Recently, the incredible progress of large language models (LLMs) has ignited\nthe spark of task automation, which decomposes the complex tasks described by\nuser instructions into sub-tasks, and invokes external tools to execute them,\nand plays a central role in autonomous agents. However, there lacks a\nsystematic and standardized benchmark to foster the development of LLMs in task\nautomation. To this end, we introduce TaskBench to evaluate the capability of\nLLMs in task automation. Specifically, task automation can be formulated into\nthree critical stages: task decomposition, tool invocation, and parameter\nprediction to fulfill user intent. 
This complexity makes data collection and\nevaluation more challenging compared to common NLP tasks. To generate\nhigh-quality evaluation datasets, we introduce the concept of Tool Graph to\nrepresent the decomposed tasks in user intent, and adopt a back-instruct method\nto simulate user instruction and annotations. Furthermore, we propose TaskEval\nto evaluate the capability of LLMs from different aspects, including task\ndecomposition, tool invocation, and parameter prediction. Experimental results\ndemonstrate that TaskBench can effectively reflect the capability of LLMs in\ntask automation. Benefiting from the mixture of automated data construction and\nhuman verification, TaskBench achieves high consistency with human\nevaluation, and can be utilized as a comprehensive and faithful benchmark for\nLLM-based autonomous agents.\n","authors":["Yongliang Shen","Kaitao Song","Xu Tan","Wenqi Zhang","Kan Ren","Siyu Yuan","Weiming Lu","Dongsheng Li","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2311.18760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18751v1","updated":"2023-11-30T17:50:47Z","published":"2023-11-30T17:50:47Z","title":"Language Model Agents Suffer from Compositional Generalization in Web\n Automation","summary":" Language model agents (LMA) recently emerged as a promising paradigm for\nmulti-step decision-making tasks, often outperforming humans and other\nreinforcement learning agents. Despite the promise, their performance on\nreal-world applications that often involve combinations of tasks is still\nunderexplored. In this work, we introduce a new benchmark, called CompWoB -- 50\nnew compositional web automation tasks reflecting more realistic assumptions.\nWe show that while existing prompted LMAs (gpt-3.5-turbo or gpt-4) achieve\n94.0% average success rate on base tasks, their performance degrades to 24.9%\nsuccess rate on compositional tasks. On the other hand, transferred LMAs\n(finetuned only on base tasks) show a smaller generalization gap, dropping from\n85.4% to 54.8%. By balancing data distribution across tasks, we train a new\nmodel, HTML-T5++, that surpasses human-level performance (95.2%) on MiniWoB,\nand achieves the best zero-shot performance on CompWoB (61.5%). While these\nhighlight the promise of small-scale finetuned and transferred models for\ncompositional generalization, their performance further degrades under\ndifferent instruction compositions that change the combination order. In contrast to\nthe recent remarkable success of LMA, our benchmark and detailed analysis\nemphasize the necessity of building LMAs that are robust and generalizable to\ntask compositionality for real-world deployment.\n","authors":["Hiroki Furuta","Yutaka Matsuo","Aleksandra Faust","Izzeddin Gur"],"pdf_url":"https://arxiv.org/pdf/2311.18751v1.pdf","comment":"Code:\n https://github.com/google-research/google-research/tree/master/compositional_rl/compwob"},{"id":"http://arxiv.org/abs/2311.18743v1","updated":"2023-11-30T17:41:30Z","published":"2023-11-30T17:41:30Z","title":"AlignBench: Benchmarking Chinese Alignment of Large Language Models","summary":" Alignment has become a critical step for instruction-tuned Large Language\nModels (LLMs) to become helpful assistants. However, effective evaluation of\nalignment for emerging Chinese LLMs is still significantly lacking, calling for\nreal-scenario grounded, open-ended, challenging and automatic evaluations\ntailored for alignment. 
To fill in this gap, we introduce AlignBench, a\ncomprehensive multi-dimensional benchmark for evaluating LLMs' alignment in\nChinese. Equipped with a human-in-the-loop data curation pipeline, our\nbenchmark employs a rule-calibrated multi-dimensional LLM-as-Judge with\nChain-of-Thought to generate explanations and final ratings as evaluations,\nensuring high reliability and interpretability. Furthermore, we developed a\ndedicated companion evaluator LLM -- CritiqueLLM, which recovers 95\\% of\nGPT-4's evaluation ability and will be provided via public APIs to researchers\nfor evaluation of alignment in Chinese LLMs. All evaluation codes, data, and\nLLM generations are available at \\url{https://github.com/THUDM/AlignBench}.\n","authors":["Xiao Liu","Xuanyu Lei","Shengyuan Wang","Yue Huang","Zhuoer Feng","Bosi Wen","Jiale Cheng","Pei Ke","Yifan Xu","Weng Lam Tam","Xiaohan Zhang","Lichao Sun","Hongning Wang","Jing Zhang","Minlie Huang","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2311.18743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18739v1","updated":"2023-11-30T17:37:56Z","published":"2023-11-30T17:37:56Z","title":"Mavericks at NADI 2023 Shared Task: Unravelling Regional Nuances through\n Dialect Identification using Transformer-based Approach","summary":" In this paper, we present our approach for the \"Nuanced Arabic Dialect\nIdentification (NADI) Shared Task 2023\". We highlight our methodology for\nsubtask 1 which deals with country-level dialect identification. Recognizing\ndialects plays an instrumental role in enhancing the performance of various\ndownstream NLP tasks such as speech recognition and translation. The task uses\nthe Twitter dataset (TWT-2023) that encompasses 18 dialects for the multi-class\nclassification problem. Numerous transformer-based models, pre-trained on\nArabic language, are employed for identifying country-level dialects. We\nfine-tune these state-of-the-art models on the provided dataset. The ensembling\nmethod is leveraged to yield improved performance of the system. We achieved an\nF1-score of 76.65 (11th rank on the leaderboard) on the test dataset.\n","authors":["Vedant Deshpande","Yash Patwardhan","Kshitij Deshpande","Sudeep Mangalvedhekar","Ravindra Murumkar"],"pdf_url":"https://arxiv.org/pdf/2311.18739v1.pdf","comment":"5 pages, 1 figure, accepted at the NADI ArabicNLP Workshop, EMNLP\n 2023"},{"id":"http://arxiv.org/abs/2311.18730v1","updated":"2023-11-30T17:26:57Z","published":"2023-11-30T17:26:57Z","title":"Mavericks at ArAIEval Shared Task: Towards a Safer Digital Space --\n Transformer Ensemble Models Tackling Deception and Persuasion","summary":" In this paper, we highlight our approach for the \"Arabic AI Tasks Evaluation\n(ArAiEval) Shared Task 2023\". We present our approaches for task 1-A and task\n2-A of the shared task which focus on persuasion technique detection and\ndisinformation detection respectively. Detection of persuasion techniques and\ndisinformation has become imperative to avoid distortion of authentic\ninformation. The tasks use multigenre snippets of tweets and news articles for\nthe given binary classification problem. We experiment with several\ntransformer-based models that were pre-trained on the Arabic language. We\nfine-tune these state-of-the-art models on the provided dataset. Ensembling is\nemployed to enhance the performance of the systems. 
We achieved a micro\nF1-score of 0.742 on task 1-A (8th rank on the leaderboard) and 0.901 on task\n2-A (7th rank on the leaderboard), respectively.\n","authors":["Sudeep Mangalvedhekar","Kshitij Deshpande","Yash Patwardhan","Vedant Deshpande","Ravindra Murumkar"],"pdf_url":"https://arxiv.org/pdf/2311.18730v1.pdf","comment":"6 pages, 1 figure, accepted at the ArAIEval ArabicNLP workshop, EMNLP\n conference 2023"},{"id":"http://arxiv.org/abs/2311.18727v1","updated":"2023-11-30T17:23:40Z","published":"2023-11-30T17:23:40Z","title":"Automatic Functional Differentiation in JAX","summary":" We extend JAX with the capability to automatically differentiate higher-order\nfunctions (functionals and operators). By representing functions as a\ngeneralization of arrays, we seamlessly use JAX's existing primitive system to\nimplement higher-order functions. We present a set of primitive operators that\nserve as foundational building blocks for constructing several key types of\nfunctionals. For every introduced primitive operator, we derive and implement\nboth linearization and transposition rules, aligning with JAX's internal\nprotocols for forward and reverse mode automatic differentiation. This\nenhancement allows for functional differentiation in the same syntax\ntraditionally used for functions. The resulting functional gradients are\nthemselves functions ready to be invoked in Python. We showcase this tool's\nefficacy and simplicity through applications where functional derivatives are\nindispensable. The source code of this work is released at\nhttps://github.com/sail-sg/autofd .\n","authors":["Min Lin"],"pdf_url":"https://arxiv.org/pdf/2311.18727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03170v2","updated":"2023-11-30T17:15:34Z","published":"2023-07-06T17:52:10Z","title":"Focused Transformer: Contrastive Training for Context Scaling","summary":" Large language models have an exceptional capability to incorporate new\ninformation in a contextual manner. However, the full potential of such an\napproach is often restrained due to a limitation in the effective context\nlength. One solution to this issue is to endow an attention layer with access\nto an external memory, which comprises (key, value) pairs. Yet, as the\nnumber of documents increases, the proportion of relevant keys to irrelevant\nones decreases, leading the model to focus more on the irrelevant keys. We\nidentify a significant challenge, dubbed the distraction issue, where keys\nlinked to different semantic values might overlap, making them hard to\ndistinguish. To tackle this problem, we introduce the Focused Transformer\n(FoT), a technique that employs a training process inspired by contrastive\nlearning. This novel approach enhances the structure of the (key, value) space,\nenabling an extension of the context length. Our method allows for fine-tuning\npre-existing, large-scale models to lengthen their effective context. This is\ndemonstrated by our fine-tuning of $3B$ and $7B$ OpenLLaMA checkpoints. The\nresulting models, which we name LongLLaMA, exhibit advancements in tasks\nrequiring a long context. We further illustrate that our LongLLaMA models\nadeptly manage a $256 k$ context length for passkey retrieval.\n","authors":["Szymon Tworkowski","Konrad Staniszewski","Mikołaj Pacek","Yuhuai Wu","Henryk Michalewski","Piotr Miłoś"],"pdf_url":"https://arxiv.org/pdf/2307.03170v2.pdf","comment":"Accepted at 37th Conference on Neural Information Processing Systems\n (NeurIPS 2023). 
28 pages, 10 figures, 11 tables"},{"id":"http://arxiv.org/abs/2311.18712v1","updated":"2023-11-30T17:11:27Z","published":"2023-11-30T17:11:27Z","title":"CoRec: An Easy Approach for Coordination Recognition","summary":" In this paper, we observe and address the challenges of the coordination\nrecognition task. Most existing methods rely on syntactic parsers to identify\nthe coordinators in a sentence and detect the coordination boundaries. However,\nstate-of-the-art syntactic parsers are slow and suffer from errors, especially\nfor long and complicated sentences. To better solve the problems, we propose a\npipeline model COordination RECognizer (CoRec). It consists of two components:\ncoordinator identifier and conjunct boundary detector. The experimental results\non datasets from various domains demonstrate the effectiveness and efficiency\nof the proposed method. Further experiments show that CoRec positively impacts\ndownstream tasks, improving the yield of state-of-the-art Open IE models.\n","authors":["Qing Wang","Haojie Jia","Wenfei Song","Qi Li"],"pdf_url":"https://arxiv.org/pdf/2311.18712v1.pdf","comment":"Accepted by EMNLP 2023 Main Conference (oral presentation)"},{"id":"http://arxiv.org/abs/2311.18711v1","updated":"2023-11-30T17:06:00Z","published":"2023-11-30T17:06:00Z","title":"Women Are Beautiful, Men Are Leaders: Gender Stereotypes in Machine\n Translation and Language Modeling","summary":" We present GEST -- a new dataset for measuring gender-stereotypical reasoning\nin masked LMs and English-to-X machine translation systems. GEST contains\nsamples that are compatible with 9 Slavic languages and English for 16 gender\nstereotypes about men and women (e.g., Women are beautiful, Men are leaders).\nThe definition of said stereotypes was informed by gender experts. We used GEST\nto evaluate 11 masked LMs and 4 machine translation systems. We discovered\nsignificant and consistent amounts of stereotypical reasoning in almost all the\nevaluated models and languages.\n","authors":["Matúš Pikuliak","Andrea Hrckova","Stefan Oresko","Marián Šimko"],"pdf_url":"https://arxiv.org/pdf/2311.18711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18702v1","updated":"2023-11-30T16:52:42Z","published":"2023-11-30T16:52:42Z","title":"CritiqueLLM: Scaling LLM-as-Critic for Effective and Explainable\n Evaluation of Large Language Model Generation","summary":" Since the natural language processing (NLP) community started to make large\nlanguage models (LLMs), such as GPT-4, act as a critic to evaluate the quality\nof generated texts, most of them only train a critique generation model of a\nspecific scale on specific datasets. We argue that a comprehensive\ninvestigation on the key factor of LLM-based evaluation models, such as scaling\nproperties, is lacking, so that it is still inconclusive whether these models\nhave potential to replace GPT-4's evaluation in practical scenarios. In this\npaper, we propose a new critique generation model called CritiqueLLM, which\nincludes a dialogue-based prompting method for high-quality referenced /\nreference-free evaluation data. Experimental results show that our model can\nachieve comparable evaluation performance to GPT-4 especially in system-level\ncorrelations, and even outperform GPT-4 in 3 out of 8 tasks in a challenging\nreference-free setting. We conduct detailed analysis to show promising scaling\nproperties of our model in the quality of generated critiques. 
We also\ndemonstrate that our generated critiques can act as scalable feedback to\ndirectly improve the generation quality of LLMs.\n","authors":["Pei Ke","Bosi Wen","Zhuoer Feng","Xiao Liu","Xuanyu Lei","Jiale Cheng","Shengyuan Wang","Aohan Zeng","Yuxiao Dong","Hongning Wang","Jie Tang","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2311.18702v1.pdf","comment":"18 pages, 5 figures"},{"id":"http://arxiv.org/abs/2304.09991v3","updated":"2023-11-30T16:30:09Z","published":"2023-04-19T21:59:04Z","title":"Supporting Human-AI Collaboration in Auditing LLMs with LLMs","summary":" Large language models are becoming increasingly pervasive and ubiquitous in\nsociety via deployment in sociotechnical systems. Yet these language models, be\nit for classification or generation, have been shown to be biased and behave\nirresponsibly, causing harm to people at scale. It is crucial to audit these\nlanguage models rigorously. Existing auditing tools leverage either or both\nhumans and AI to find failures. In this work, we draw upon literature in\nhuman-AI collaboration and sensemaking, and conduct interviews with research\nexperts in safe and fair AI, to build upon the auditing tool: AdaTest (Ribeiro\nand Lundberg, 2022), which is powered by a generative large language model\n(LLM). Through the design process we highlight the importance of sensemaking\nand human-AI communication to leverage complementary strengths of humans and\ngenerative models in collaborative auditing. To evaluate the effectiveness of\nthe augmented tool, AdaTest++, we conduct user studies with participants\nauditing two commercial language models: OpenAI's GPT-3 and Azure's sentiment\nanalysis model. Qualitative analysis shows that AdaTest++ effectively leverages\nhuman strengths such as schematization, hypothesis formation and testing.\nFurther, with our tool, participants identified a variety of failures modes,\ncovering 26 different topics over 2 tasks, that have been shown before in\nformal audits and also those previously under-reported.\n","authors":["Charvi Rastogi","Marco Tulio Ribeiro","Nicholas King","Harsha Nori","Saleema Amershi"],"pdf_url":"https://arxiv.org/pdf/2304.09991v3.pdf","comment":"21 pages, 3 figures"},{"id":"http://arxiv.org/abs/2311.18681v1","updated":"2023-11-30T16:28:40Z","published":"2023-11-30T16:28:40Z","title":"RaDialog: A Large Vision-Language Model for Radiology Report Generation\n and Conversational Assistance","summary":" Conversational AI tools that can generate and discuss clinically correct\nradiology reports for a given medical image have the potential to transform\nradiology. Such a human-in-the-loop radiology assistant could facilitate a\ncollaborative diagnostic process, thus saving time and improving the quality of\nreports. Towards this goal, we introduce RaDialog, the first thoroughly\nevaluated and publicly available large vision-language model for radiology\nreport generation and interactive dialog. RaDialog effectively integrates\nvisual image features and structured pathology findings with a large language\nmodel (LLM) while simultaneously adapting it to a specialized domain using\nparameter-efficient fine-tuning. To keep the conversational abilities of the\nunderlying LLM, we propose a comprehensive, semi-automatically labeled,\nimage-grounded instruct dataset for chest X-ray radiology tasks. 
By training\nwith this dataset, our method achieves state-of-the-art clinical correctness in\nreport generation and shows impressive abilities in interactive tasks such as\ncorrecting reports and answering questions, serving as a foundational step\ntoward clinical dialog systems. Our code is available on github:\nhttps://github.com/ChantalMP/RaDialog.\n","authors":["Chantal Pellegrini","Ege Özsoy","Benjamin Busam","Nassir Navab","Matthias Keicher"],"pdf_url":"https://arxiv.org/pdf/2311.18681v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2305.18498v2","updated":"2023-11-30T16:08:54Z","published":"2023-05-29T14:19:40Z","title":"ANPL: Towards Natural Programming with Interactive Decomposition","summary":" Though LLMs are capable of generating plausible programs, it's challenging to\ninteract with the LLMs further to revise the program, especially if the user's\nspecific requirements are different from the initial proposal. In this paper,\nwe introduce ANPL, an interactive programming system that ensures users can\nalways refine the generated code towards their specific programmatic intents\nvia structured decompositions. Borrowing the paradigm of sketching from program\nsynthesis, an ANPL program consists of a set of input-outputs that it must\nsatisfy, a ``sketch'' -- control/data flow expressed in precise code (e.g.\nPython), and ``holes'' -- sub-modules to be implemented by the LLM specified\nwith natural language. The user revises an ANPL program by either modifying the\nsketch, changing the language used to describe the holes, or providing\nadditional input-outputs to a particular hole, turning it into a sub-ANPL\nprogram that can be solved recursively. This workflow allows the users to\noffload programming burdens to the LLM as much as possible while retaining the\nability to pinpoint and resolve bugs locally, without exposing the rest of the\nprogram to the LLM. We deploy ANPL on the Abstraction and Reasoning Corpus\n(ARC), a set of unique tasks that are challenging for state-of-the-art AI\nsystems, showing it outperforms baseline programming systems that (a) without\nthe ability to decompose tasks interactively and (b) without the guarantee that\nthe modules can be correctly composed together. Additional evaluations on APPS,\nHumanEval, and real-world programming tasks have validated that the ANPL\nframework is applicable to multiple programming domains. We release the ANPL\nsolutions to the ARC tasks as a dataset, providing insights into how humans\ndecompose novel tasks programmatically. See our code at\nhttps://iprc-dip.github.io/ANPL/.\n","authors":["Di Huang","Ziyuan Nan","Xing Hu","Pengwei Jin","Shaohui Peng","Yuanbo Wen","Rui Zhang","Zidong Du","Qi Guo","Yewen Pu","Yunji Chen"],"pdf_url":"https://arxiv.org/pdf/2305.18498v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18658v1","updated":"2023-11-30T16:08:04Z","published":"2023-11-30T16:08:04Z","title":"ArcMMLU: A Library and Information Science Benchmark for Large Language\n Models","summary":" In light of the rapidly evolving capabilities of large language models\n(LLMs), it becomes imperative to develop rigorous domain-specific evaluation\nbenchmarks to accurately assess their capabilities. In response to this need,\nthis paper introduces ArcMMLU, a specialized benchmark tailored for the Library\n& Information Science (LIS) domain in Chinese. 
This benchmark aims to measure\nthe knowledge and reasoning capability of LLMs within four key sub-domains:\nArchival Science, Data Science, Library Science, and Information Science.\nFollowing the format of MMLU/CMMLU, we collected over 6,000 high-quality\nquestions for the compilation of ArcMMLU. This extensive compilation can\nreflect the diverse nature of the LIS domain and offer a robust foundation for\nLLM evaluation. Our comprehensive evaluation reveals that while most mainstream\nLLMs achieve an average accuracy rate above 50% on ArcMMLU, there remains a\nnotable performance gap, suggesting substantial headroom for refinement in LLM\ncapabilities within the LIS domain. Further analysis explores the effectiveness\nof few-shot examples on model performance and highlights challenging questions\nwhere models consistently underperform, providing valuable insights for\ntargeted improvements. ArcMMLU fills a critical gap in LLM evaluations within\nthe Chinese LIS domain and paves the way for future development of LLMs\ntailored to this specialized area.\n","authors":["Shitou Zhang","Zuchao Li","Xingshen Liu","Liming Yang","Ping Wang"],"pdf_url":"https://arxiv.org/pdf/2311.18658v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08018v4","updated":"2023-11-30T15:29:58Z","published":"2023-06-13T14:35:34Z","title":"Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for\n Large Language Models","summary":" Large Language Models (LLMs), with their remarkable task-handling\ncapabilities and innovative outputs, have catalyzed significant advancements\nacross a spectrum of fields. However, their proficiency within specialized\ndomains such as biomolecular studies remains limited. To address this\nchallenge, we introduce Mol-Instructions, a comprehensive instruction dataset\ndesigned for the biomolecular domain. Mol-Instructions encompasses three key\ncomponents: molecule-oriented instructions, protein-oriented instructions, and\nbiomolecular text instructions. Each component aims to improve the\nunderstanding and prediction capabilities of LLMs concerning biomolecular\nfeatures and behaviors. Through extensive instruction tuning experiments on\nLLMs, we demonstrate the effectiveness of Mol-Instructions in enhancing large\nmodels' performance in the intricate realm of biomolecular studies, thus\nfostering progress in the biomolecular research community. Mol-Instructions is\npublicly available for ongoing research and will undergo regular updates to\nenhance its applicability.\n","authors":["Yin Fang","Xiaozhuan Liang","Ningyu Zhang","Kangwei Liu","Rui Huang","Zhuo Chen","Xiaohui Fan","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2306.08018v4.pdf","comment":"Project homepage: https://github.com/zjunlp/Mol-Instructions, add\n more experiments"},{"id":"http://arxiv.org/abs/2311.18609v1","updated":"2023-11-30T15:06:50Z","published":"2023-11-30T15:06:50Z","title":"ArthModel: Enhance Arithmetic Skills to Large Language Model","summary":" With the great success of ChatGPT, the research of large language models has\nbecome increasingly popular. However, the models have several limitations, such\nas toxicity and poor performance on arithmetic solving. Meanwhile, LLMs may have\nsome potential abilities that have yet to be exploited. In this paper, we\nchoose a different way to enhance the arithmetic ability of LLMs. We propose to\ntrain the LLM to generate a postfix expression related to the arithmetic problem\nand incorporate it with small pretrained models.
Moreover, this small model\ntransfers the token embeddings into real dense numbers and invokes native\nfunctions of a deep learning platform to get the correct answer. To generate\nthe final result, we propose prompt injection for adding the result outputs by\nthe small model to the LLM. This work provides different ways of thinking, training\nand using a language model. The codes and models will be released at\n\\url{https://github.com/eteced/arithmetic_finetuning_v1}.\n","authors":["Yingdi Guo"],"pdf_url":"https://arxiv.org/pdf/2311.18609v1.pdf","comment":"7 pages, 4 figures, 1 table"},{"id":"http://arxiv.org/abs/2311.18580v1","updated":"2023-11-30T14:18:47Z","published":"2023-11-30T14:18:47Z","title":"FFT: Towards Harmlessness Evaluation and Analysis for LLMs with\n Factuality, Fairness, Toxicity","summary":" The widespread use of generative artificial intelligence has heightened concerns\nabout the potential harms posed by AI-generated texts, primarily stemming from\nfactoid, unfair, and toxic content. Previous researchers have invested much\neffort in assessing the harmlessness of generative language models. However,\nexisting benchmarks are struggling in the era of large language models (LLMs),\ndue to the stronger language generation and instruction following capabilities,\nas well as wider applications. In this paper, we propose FFT, a new benchmark\nwith 2116 elaborately designed instances, for LLM harmlessness evaluation with\nfactuality, fairness, and toxicity. To investigate the potential harms of LLMs,\nwe evaluate 9 representative LLMs covering various parameter scales, training\nstages, and creators. Experiments show that the harmlessness of LLMs is still\nunsatisfactory, and extensive analysis derives some insightful findings\nthat could inspire future research on harmless LLMs.\n","authors":["Shiyao Cui","Zhenyu Zhang","Yilong Chen","Wenyuan Zhang","Tianyun Liu","Siqi Wang","Tingwen Liu"],"pdf_url":"https://arxiv.org/pdf/2311.18580v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2305.14993v2","updated":"2023-11-30T14:14:31Z","published":"2023-05-24T10:29:45Z","title":"Controlling Pre-trained Language Models for Grade-Specific Text\n Simplification","summary":" Text simplification (TS) systems rewrite text to make it more readable while\npreserving its content. However, what makes a text easy to read depends on the\nintended readers. Recent work has shown that pre-trained language models can\nsimplify text using a wealth of techniques to control output simplicity,\nranging from specifying only the desired reading grade level, to directly\nspecifying low-level edit operations. Yet it remains unclear how to set these\ncontrol parameters in practice. Existing approaches set them at the corpus\nlevel, disregarding the complexity of individual inputs and considering only\none level of output complexity. In this work, we conduct an empirical study to\nunderstand how different control mechanisms impact the adequacy and simplicity\nof text simplification systems. Based on these insights, we introduce a simple\nmethod that predicts the edit operations required for simplifying a text for a\nspecific grade level on an instance-per-instance basis.
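The ArthModel entry above rests on having the LLM emit a postfix (reverse Polish) expression that a separate, deterministic component then evaluates. As a minimal sketch of that evaluation step alone, assuming plain Python in place of the paper's small pretrained model and its deep-learning-platform calls, a postfix evaluator could look like this:

def eval_postfix(tokens: list[str]) -> float:
    """Evaluate a postfix (reverse Polish) arithmetic expression."""
    ops = {"+": lambda a, b: a + b,
           "-": lambda a, b: a - b,
           "*": lambda a, b: a * b,
           "/": lambda a, b: a / b}
    stack: list[float] = []
    for tok in tokens:
        if tok in ops:
            b = stack.pop()          # right operand was pushed last
            a = stack.pop()
            stack.append(ops[tok](a, b))
        else:
            stack.append(float(tok)) # operands are pushed as numbers
    assert len(stack) == 1, "malformed postfix expression"
    return stack[0]

# "(3 + 5) * 2" written in postfix form, as an LLM might emit it
print(eval_postfix("3 5 + 2 *".split()))   # 16.0

Delegating the arithmetic to a deterministic evaluator like this is what lets the language model focus only on producing the expression rather than the numeric answer.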
This approach improves\nthe quality of the simplified outputs over corpus-level search-based\nheuristics.\n","authors":["Sweta Agrawal","Marine Carpuat"],"pdf_url":"https://arxiv.org/pdf/2305.14993v2.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.18567v1","updated":"2023-11-30T13:58:13Z","published":"2023-11-30T13:58:13Z","title":"Grammatical Gender's Influence on Distributional Semantics: A Causal\n Perspective","summary":" How much meaning influences gender assignment across languages is an active\narea of research in modern linguistics and cognitive science. We can view\ncurrent approaches as aiming to determine where gender assignment falls on a\nspectrum, from being fully arbitrarily determined to being largely semantically\ndetermined. For the latter case, there is a formulation of the neo-Whorfian\nhypothesis, which claims that even inanimate noun gender influences how people\nconceive of and talk about objects (using the choice of adjective used to\nmodify inanimate nouns as a proxy for meaning). We offer a novel, causal\ngraphical model that jointly represents the interactions between a noun's\ngrammatical gender, its meaning, and adjective choice. In accordance with past\nresults, we find a relationship between the gender of nouns and the adjectives\nwhich modify them. However, when we control for the meaning of the noun, we\nfind that grammatical gender has a near-zero effect on adjective choice,\nthereby calling the neo-Whorfian hypothesis into question.\n","authors":["Karolina Stańczak","Kevin Du","Adina Williams","Isabelle Augenstein","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2311.18567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18481v1","updated":"2023-11-30T11:47:50Z","published":"2023-11-30T11:47:50Z","title":"ESG Accountability Made Easy: DocQA at Your Service","summary":" We present Deep Search DocQA. This application enables information extraction\nfrom documents via a question-answering conversational assistant. The system\nintegrates several technologies from different AI disciplines consisting of\ndocument conversion to machine-readable format (via computer vision), finding\nrelevant data (via natural language processing), and formulating an eloquent\nresponse (via large language models). Users can explore over 10,000\nEnvironmental, Social, and Governance (ESG) disclosure reports from over 2000\ncorporations. The Deep Search platform can be accessed at:\nhttps://ds4sd.github.io.\n","authors":["Lokesh Mishra","Cesar Berrospi","Kasper Dinkla","Diego Antognini","Francesco Fusco","Benedikt Bothur","Maksym Lysak","Nikolaos Livathinos","Ahmed Nassar","Panagiotis Vagenas","Lucas Morin","Christoph Auer","Michele Dolfi","Peter Staar"],"pdf_url":"https://arxiv.org/pdf/2311.18481v1.pdf","comment":"Accepted at the Demonstration Track of the 38th Annual AAAI\n Conference on Artificial Intelligence (AAAI 24)"},{"id":"http://arxiv.org/abs/2111.12727v3","updated":"2023-11-30T11:47:36Z","published":"2021-11-24T19:00:05Z","title":"Generating More Pertinent Captions by Leveraging Semantics and Style on\n Multi-Source Datasets","summary":" This paper addresses the task of generating fluent descriptions by training\non a non-uniform combination of data sources, containing both human-annotated\nand web-collected captions. Large-scale datasets with noisy image-text pairs,\nindeed, provide a sub-optimal source of supervision because of their\nlow-quality descriptive style, while human-annotated datasets are cleaner but\nsmaller in scale. 
To get the best of both worlds, we propose to leverage and\nseparate semantics and descriptive style through the incorporation of a style\ntoken and keywords extracted through a retrieval component. The proposed model\navoids the need of object detectors, is trained with a single objective of\nprompt language modeling, and can replicate the style of human-collected\ncaptions while training on sources with different input styles. Experimentally,\nthe model shows a strong capability of recognizing real-world concepts and\nproducing high-quality captions. Extensive experiments are performed on\ndifferent image captioning datasets, including CC3M, nocaps, and the\ncompetitive COCO dataset, where our model consistently outperforms baselines\nand state-of-the-art approaches.\n","authors":["Marcella Cornia","Lorenzo Baraldi","Giuseppe Fiameni","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2111.12727v3.pdf","comment":"Accepted to IJCV"},{"id":"http://arxiv.org/abs/2311.18466v1","updated":"2023-11-30T11:18:45Z","published":"2023-11-30T11:18:45Z","title":"Use of explicit replies as coordination mechanisms in online student\n debate","summary":" People in conversation entrain their linguistic behaviours through\nspontaneous alignment mechanisms [7] - both in face-to-face and\ncomputer-mediated communication (CMC) [8]. In CMC, one of the mechanisms\nthrough which linguistic entrainment happens is through explicit replies.\nIndeed, the use of explicit replies influences the structure of conversations,\nfavouring the formation of reply-trees typically delineated by topic shifts\n[5]. The interpersonal coordination mechanisms realized by how actors address\neach other have been studied using a probabilistic framework proposed by David\nGibson [2,3]. Other recent approaches use computational methods and information\ntheory to quantify changes in text. We explore coordination mechanisms\nconcerned with some of the roles utterances play in dialogues - specifically in\nexplicit replies. We identify these roles by finding community structure in the\nconversation's vocabulary using a non-parametric, hierarchical topic model.\nSome conversations may always stay on the ground, remaining at the level of\ngeneral introductory chatter. Some others may develop a specific sub-topic in\nsignificant depth and detail. Even others may jump between general chatter,\nout-of-topic remarks and people agreeing or disagreeing without further\nelaboration.\n","authors":["Bruno D. Ferreira-Saraiva","Joao P. Matos-Carvalho","Manuel Pita"],"pdf_url":"https://arxiv.org/pdf/2311.18466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.10696v2","updated":"2023-11-30T10:42:26Z","published":"2022-12-21T00:00:01Z","title":"Analyzing Semantic Faithfulness of Language Models via Input\n Intervention on Question Answering","summary":" Transformer-based language models have been shown to be highly effective for\nseveral NLP tasks. In this paper, we consider three transformer models, BERT,\nRoBERTa, and XLNet, in both small and large versions, and investigate how\nfaithful their representations are with respect to the semantic content of\ntexts. We formalize a notion of semantic faithfulness, in which the semantic\ncontent of a text should causally figure in a model's inferences in question\nanswering. We then test this notion by observing a model's behavior on\nanswering questions about a story after performing two novel semantic\ninterventions: deletion intervention and negation intervention. 
While\ntransformer models achieve high performance on standard question answering\ntasks, we show that they fail to be semantically faithful once we perform these\ninterventions for a significant number of cases (~50% for deletion\nintervention, and ~20% drop in accuracy for negation intervention). We then\npropose an intervention-based training regime that can mitigate the undesirable\neffects for deletion intervention by a significant margin (from ~ 50% to ~6%).\nWe analyze the inner-workings of the models to better understand the\neffectiveness of intervention-based training for deletion intervention. But we\nshow that this training does not attenuate other aspects of semantic\nunfaithfulness such as the models' inability to deal with negation intervention\nor to capture the predicate-argument structure of texts. We also test\nInstructGPT, via prompting, for its ability to handle the two interventions and\nto capture predicate-argument structure. While InstructGPT models do achieve\nvery high performance on predicate-argument structure task, they fail to\nrespond adequately to our deletion and negation interventions.\n","authors":["Akshay Chaturvedi","Swarnadeep Bhar","Soumadeep Saha","Utpal Garain","Nicholas Asher"],"pdf_url":"https://arxiv.org/pdf/2212.10696v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05950v3","updated":"2023-11-30T10:35:40Z","published":"2023-09-12T04:03:41Z","title":"Language Models as Black-Box Optimizers for Vision-Language Models","summary":" Vision-language models (VLMs) pre-trained on web-scale datasets have\ndemonstrated remarkable capabilities on downstream tasks when fine-tuned with\nminimal data. However, many VLMs rely on proprietary data and are not\nopen-source, which restricts the use of white-box approaches for fine-tuning.\nAs such, we aim to develop a black-box approach to optimize VLMs through\nnatural language prompts, thereby avoiding the need to access model parameters,\nfeature embeddings, or even output logits. We propose employing chat-based LLMs\nto search for the best text prompt for VLMs. Specifically, we adopt an\nautomatic hill-climbing procedure that converges to an effective prompt by\nevaluating the performance of current prompts and asking LLMs to refine them\nbased on textual feedback, all within a conversational process without\nhuman-in-the-loop. In a challenging 1-shot image classification setup, our\nsimple approach surpasses the white-box continuous prompting method (CoOp) by\nan average of 1.5% across 11 datasets including ImageNet. Our approach also\noutperforms both human-engineered and LLM-generated prompts. We highlight the\nadvantage of conversational feedback that incorporates both positive and\nnegative prompts, suggesting that LLMs can utilize the implicit gradient\ndirection in textual feedback for a more efficient search. In addition, we find\nthat the text prompts generated through our strategy are not only more\ninterpretable but also transfer well across different VLM architectures in a\nblack-box manner. 
Lastly, we demonstrate our framework on a state-of-the-art\nblack-box VLM (DALL-E 3) for text-to-image optimization.\n","authors":["Shihong Liu","Zhiqiu Lin","Samuel Yu","Ryan Lee","Tiffany Ling","Deepak Pathak","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2309.05950v3.pdf","comment":"Project site: llm-can-optimize-vlm.github.io"},{"id":"http://arxiv.org/abs/1808.09401v2","updated":"2023-11-30T09:58:01Z","published":"2018-08-28T16:46:30Z","title":"Temporal Information Extraction by Predicting Relative Time-lines","summary":" The current leading paradigm for temporal information extraction from text\nconsists of three phases: (1) recognition of events and temporal expressions,\n(2) recognition of temporal relations among them, and (3) time-line\nconstruction from the temporal relations. In contrast to the first two phases,\nthe last phase, time-line construction, received little attention and is the\nfocus of this work. In this paper, we propose a new method to construct a\nlinear time-line from a set of (extracted) temporal relations. But more\nimportantly, we propose a novel paradigm in which we directly predict start and\nend-points for events from the text, constituting a time-line without going\nthrough the intermediate step of prediction of temporal relations as in earlier\nwork. Within this paradigm, we propose two models that predict in linear\ncomplexity, and a new training loss using TimeML-style annotations, yielding\npromising results.\n","authors":["Artuur Leeuwenberg","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/1808.09401v2.pdf","comment":"Accepted at the Conference on Empirical Methods in Natural Language\n Processing (EMNLP 2018). Small correction in Eq. 6 on 30 Nov. 2023"},{"id":"http://arxiv.org/abs/2311.18397v1","updated":"2023-11-30T09:48:51Z","published":"2023-11-30T09:48:51Z","title":"IAG: Induction-Augmented Generation Framework for Answering Reasoning\n Questions","summary":" Retrieval-Augmented Generation (RAG), by incorporating external knowledge\nwith parametric memory of language models, has become the state-of-the-art\narchitecture for open-domain QA tasks. However, common knowledge bases are\ninherently constrained by limited coverage and noisy information, making\nretrieval-based approaches inadequate to answer implicit reasoning questions.\nIn this paper, we propose an Induction-Augmented Generation (IAG) framework\nthat utilizes inductive knowledge along with the retrieved documents for\nimplicit reasoning. We leverage large language models (LLMs) for deriving such\nknowledge via a novel prompting method based on inductive reasoning patterns.\nOn top of this, we implement two versions of IAG named IAG-GPT and IAG-Student,\nrespectively. IAG-GPT directly utilizes the knowledge generated by GPT-3 for\nanswer prediction, while IAG-Student gets rid of dependencies on GPT service at\ninference time by incorporating a student inductor model. The inductor is\nfirstly trained via knowledge distillation and further optimized by\nback-propagating the generator feedback via differentiable beam scores.\nExperimental results show that IAG outperforms RAG baselines as well as ChatGPT\non two Open-Domain QA tasks. 
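The black-box prompt-optimization entry above describes an automatic hill-climbing loop in which a chat LLM rewrites the current prompt using scored textual feedback, without access to gradients or logits. A rough sketch of that loop follows; score_fn and propose_fn are hypothetical stand-ins for the 1-shot VLM evaluation and the chat-LLM refinement call, neither of which is specified in the abstract:

from typing import Callable

def hill_climb_prompt(seed_prompt: str,
                      score_fn: Callable[[str], float],
                      propose_fn: Callable[[list[tuple[str, float]]], str],
                      iterations: int = 20) -> str:
    """Keep the best prompt seen so far; ask an LLM (via propose_fn)
    to rewrite it based on the scored history of earlier attempts."""
    history = [(seed_prompt, score_fn(seed_prompt))]
    best_prompt, best_score = history[0]
    for _ in range(iterations):
        candidate = propose_fn(history)   # LLM refines using textual feedback
        score = score_fn(candidate)       # e.g. downstream accuracy of the VLM
        history.append((candidate, score))
        if score > best_score:            # greedy hill climbing
            best_prompt, best_score = candidate, score
    return best_prompt

Passing the full scored history to propose_fn is what lets the refinement model see both well- and poorly-performing prompts, mirroring the positive/negative feedback the abstract highlights.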
Notably, our best models have won the first place\nin the official leaderboards of CSQA2.0 (since Nov 1, 2022) and StrategyQA\n(since Jan 8, 2023).\n","authors":["Zhebin Zhang","Xinyu Zhang","Yuanhang Ren","Saijiang Shi","Meng Han","Yongkang Wu","Ruofei Lai","Zhao Cao"],"pdf_url":"https://arxiv.org/pdf/2311.18397v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10570v2","updated":"2023-11-30T09:37:20Z","published":"2023-10-16T16:45:12Z","title":"On Context Utilization in Summarization with Large Language Models","summary":" Large language models (LLMs) excel in zero-shot abstractive summarization\ntasks, delivering fluent and pertinent summaries. Recent advancements have\nextended their capabilities to handle long-input contexts, surpassing token\nlimits of 100k. However, in the realm of multi-document question answering,\nlanguage models exhibit uneven utilization of their input context. They tend to\nfavor the initial and final segments, resulting in a U-shaped performance\npattern concerning where the answer is located within the input. This bias\nraises concerns, particularly in summarization tasks where crucial content may\nbe dispersed throughout the source document(s). This paper presents a\ncomprehensive investigation encompassing 10 datasets, 5 LLMs, and 5 evaluation\nmetrics to analyze how these models leverage their input for abstractive\nsummarization. Our findings reveal a pronounced bias towards the introductory\ncontent (and to a lesser extent, the final content), posing challenges for LLM\nperformance across a range of diverse summarization benchmarks.\n","authors":["Mathieu Ravaut","Shafiq Joty","Aixin Sun","Nancy F. Chen"],"pdf_url":"https://arxiv.org/pdf/2310.10570v2.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2311.18364v1","updated":"2023-11-30T09:03:49Z","published":"2023-11-30T09:03:49Z","title":"Hubness Reduction Improves Sentence-BERT Semantic Spaces","summary":" Semantic representations of text, i.e. representations of natural language\nwhich capture meaning by geometry, are essential for areas such as information\nretrieval and document grouping. High-dimensional trained dense vectors have\nreceived much attention in recent years as such representations. We investigate\nthe structure of semantic spaces that arise from embeddings made with\nSentence-BERT and find that the representations suffer from a well-known\nproblem in high dimensions called hubness. Hubness results in asymmetric\nneighborhood relations, such that some texts (the hubs) are neighbours of many\nother texts while most texts (so-called anti-hubs), are neighbours of few or no\nother texts. We quantify the semantic quality of the embeddings using hubness\nscores and error rate of a neighbourhood based classifier. We find that when\nhubness is high, we can reduce error rate and hubness using hubness reduction\nmethods. We identify a combination of two methods as resulting in the best\nreduction. For example, on one of the tested pretrained models, this combined\nmethod can reduce hubness by about 75% and error rate by about 9%. Thus, we\nargue that mitigating hubness in the embedding space provides better semantic\nrepresentations of text.\n","authors":["Beatrix M. G. 
Nielsen","Lars Kai Hansen"],"pdf_url":"https://arxiv.org/pdf/2311.18364v1.pdf","comment":"Accepted at NLDL 2024"},{"id":"http://arxiv.org/abs/2310.03262v2","updated":"2023-11-30T08:58:50Z","published":"2023-10-05T02:35:00Z","title":"Predicting Emergent Abilities with Infinite Resolution Evaluation","summary":" The scientific scale-up of large language models (LLMs) necessitates a\ncomprehensive understanding of their scaling properties. However, the existing\nliterature on the scaling properties only yields an incomplete answer:\noptimization loss decreases predictably as the model size increases, in line\nwith established scaling laws; yet no scaling law for tasks has been established\nand the task performances are far from predictable during scaling. Task\nperformances typically show minor gains on small models until they improve\ndramatically once models exceed a size threshold, exemplifying the ``emergent\nabilities''. In this study, we discover that small models, although they\nexhibit minor performance gains, demonstrate critical and consistent task performance\nimprovements that are not captured by conventional evaluation strategies due to\ninsufficient measurement resolution. To measure such improvements, we introduce\nPassUntil, an evaluation strategy with theoretically infinite resolution,\nthrough massive sampling in the decoding phase. With PassUntil, we conduct a\nquantitative investigation into the scaling law of task performance. The\ninvestigation contains two parts. Firstly, a strict task scaling law that is\nnot conventionally known to exist is identified, enhancing the predictability\nof task performances. Remarkably, we are able to predict the performance of the\n2.4B model on code generation with merely 0.05\\% deviation before training\nstarts, which is the first systematic attempt to verify predictable scaling\nproposed by GPT-4's report. Secondly, we are able to study emergent abilities\nquantitatively. We identify a kind of accelerated emergence whose scaling curve\ncannot be fitted by the standard scaling law function and has an increasing speed.\nWe then examine two hypotheses and suggest that the ``multiple circuits\nhypothesis'' might be responsible for the accelerated emergence.\n","authors":["Shengding Hu","Xin Liu","Xu Han","Xinrong Zhang","Chaoqun He","Weilin Zhao","Yankai Lin","Ning Ding","Zebin Ou","Guoyang Zeng","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2310.03262v2.pdf","comment":"After revision"},{"id":"http://arxiv.org/abs/2305.13172v3","updated":"2023-11-30T08:55:24Z","published":"2023-05-22T16:00:00Z","title":"Editing Large Language Models: Problems, Methods, and Opportunities","summary":" Despite the ability to train capable LLMs, the methodology for maintaining\ntheir relevancy and rectifying errors remains elusive. To this end, the past\nfew years have witnessed a surge in techniques for editing LLMs, the objective\nof which is to efficiently alter the behavior of LLMs within a specific domain\nwithout negatively impacting performance across other inputs. This paper\nembarks on a deep exploration of the problems, methods, and opportunities\nrelated to model editing for LLMs. In particular, we provide an exhaustive\noverview of the task definition and challenges associated with model editing,\nalong with an in-depth empirical analysis of the most progressive methods\ncurrently at our disposal. We also build a new benchmark dataset to facilitate\na more robust evaluation and pinpoint enduring issues intrinsic to existing\ntechniques.
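The hubness entry above quantifies how skewed the k-occurrence distribution of an embedding space is: a few "hub" texts show up in many neighbourhoods while most texts show up in few. A small, self-contained way to compute that kind of hubness score is sketched below (k-occurrence skewness is a common formulation; the paper may use a different estimator or neighbourhood size):

import numpy as np
from scipy.stats import skew

def hubness_skewness(embeddings: np.ndarray, k: int = 10) -> float:
    """Skewness of the k-occurrence distribution: how often each point
    appears among the k nearest neighbours of the others (higher = more hubness)."""
    # cosine similarity via L2-normalised dot products
    normed = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    sims = normed @ normed.T
    np.fill_diagonal(sims, -np.inf)              # exclude self-neighbourhood
    knn = np.argsort(-sims, axis=1)[:, :k]       # each point's k nearest neighbours
    # N_k(x): how many times each point occurs in other points' k-NN lists
    k_occurrence = np.bincount(knn.ravel(), minlength=len(embeddings))
    return float(skew(k_occurrence))

# toy check on random vectors; high-dimensional data tends to show positive skew
rng = np.random.default_rng(0)
print(hubness_skewness(rng.normal(size=(500, 384)), k=10))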
Our objective is to provide valuable insights into the\neffectiveness and feasibility of each editing technique, thereby assisting the\ncommunity in making informed decisions on the selection of the most appropriate\nmethod for a specific task or context. Code and datasets are available at\nhttps://github.com/zjunlp/EasyEdit.\n","authors":["Yunzhi Yao","Peng Wang","Bozhong Tian","Siyuan Cheng","Zhoubo Li","Shumin Deng","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.13172v3.pdf","comment":"EMNLP 2023. Updated with new experiments"},{"id":"http://arxiv.org/abs/2311.18353v1","updated":"2023-11-30T08:44:55Z","published":"2023-11-30T08:44:55Z","title":"Evaluating the Rationale Understanding of Critical Reasoning in Logical\n Reading Comprehension","summary":" To precisely evaluate a language model's capability for logical reading\ncomprehension, we present a dataset for testing the understanding of the\nrationale behind critical reasoning. For questions taken from an existing\nmultiplechoice logical reading comprehension dataset, we crowdsource rationale\ntexts that explain why we should select or eliminate answer options, resulting\nin 3,003 multiple-choice subquestions that are associated with 943 main\nquestions. Experiments on our dataset show that recent large language models\n(e.g., InstructGPT) struggle to answer the subquestions even if they are able\nto answer the main questions correctly. We find that the models perform\nparticularly poorly in answering subquestions written for the incorrect options\nof the main questions, implying that the models have a limited capability for\nexplaining why incorrect alternatives should be eliminated. These results\nsuggest that our dataset encourages further investigation into the critical\nreasoning ability of language models while focusing on the elimination process\nof relevant alternatives.\n","authors":["Akira Kawabata","Saku Sugawara"],"pdf_url":"https://arxiv.org/pdf/2311.18353v1.pdf","comment":"Accepted to EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.17940v4","updated":"2023-11-30T08:26:16Z","published":"2023-10-27T07:34:51Z","title":"Unified Segment-to-Segment Framework for Simultaneous Sequence\n Generation","summary":" Simultaneous sequence generation is a pivotal task for real-time scenarios,\nsuch as streaming speech recognition, simultaneous machine translation and\nsimultaneous speech translation, where the target sequence is generated while\nreceiving the source sequence. The crux of achieving high-quality generation\nwith low latency lies in identifying the optimal moments for generating,\naccomplished by learning a mapping between the source and target sequences.\nHowever, existing methods often rely on task-specific heuristics for different\nsequence types, limiting the model's capacity to adaptively learn the\nsource-target mapping and hindering the exploration of multi-task learning for\nvarious simultaneous tasks. In this paper, we propose a unified\nsegment-to-segment framework (Seg2Seg) for simultaneous sequence generation,\nwhich learns the mapping in an adaptive and unified manner. During the process\nof simultaneous generation, the model alternates between waiting for a source\nsegment and generating a target segment, making the segment serve as the\nnatural bridge between the source and target. 
To accomplish this, Seg2Seg\nintroduces a latent segment as the pivot between source to target and explores\nall potential source-target mappings via the proposed expectation training,\nthereby learning the optimal moments for generating. Experiments on multiple\nsimultaneous generation tasks demonstrate that Seg2Seg achieves\nstate-of-the-art performance and exhibits better generality across various\ntasks.\n","authors":["Shaolei Zhang","Yang Feng"],"pdf_url":"https://arxiv.org/pdf/2310.17940v4.pdf","comment":"Accepted at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2309.16573v2","updated":"2023-11-30T07:59:26Z","published":"2023-09-28T16:29:52Z","title":"Language Models as a Service: Overview of a New Paradigm and its\n Challenges","summary":" Some of the most powerful language models currently are proprietary systems,\naccessible only via (typically restrictive) web or software programming\ninterfaces. This is the Language-Models-as-a-Service (LMaaS) paradigm. In\ncontrast with scenarios where full model access is available, as in the case of\nopen-source models, such closed-off language models present specific challenges\nfor evaluating, benchmarking, and testing them. This paper has two goals: on\nthe one hand, we delineate how the aforementioned challenges act as impediments\nto the accessibility, replicability, reliability, and trustworthiness of LMaaS.\nWe systematically examine the issues that arise from a lack of information\nabout language models for each of these four aspects. We conduct a detailed\nanalysis of existing solutions and put forth a number of considered\nrecommendations, and highlight the directions for future advancements. On the\nother hand, it serves as a comprehensive resource for existing knowledge on\ncurrent, major LMaaS, offering a synthesized overview of the licences and\ncapabilities their interfaces offer.\n","authors":["Emanuele La Malfa","Aleksandar Petrov","Simon Frieder","Christoph Weinhuber","Ryan Burnell","Raza Nazar","Anthony G. Cohn","Nigel Shadbolt","Michael Wooldridge"],"pdf_url":"https://arxiv.org/pdf/2309.16573v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.03215v2","updated":"2023-11-30T06:51:24Z","published":"2022-01-10T08:47:52Z","title":"Handwriting recognition and automatic scoring for descriptive answers in\n Japanese language tests","summary":" This paper presents an experiment of automatically scoring handwritten\ndescriptive answers in the trial tests for the new Japanese university entrance\nexamination, which were made for about 120,000 examinees in 2017 and 2018.\nThere are about 400,000 answers with more than 20 million characters. Although\nall answers have been scored by human examiners, handwritten characters are not\nlabeled. We present our attempt to adapt deep neural network-based handwriting\nrecognizers trained on a labeled handwriting dataset into this unlabeled answer\nset. Our proposed method combines different training strategies, ensembles\nmultiple recognizers, and uses a language model built from a large general\ncorpus to avoid overfitting into specific data. In our experiment, the proposed\nmethod records character accuracy of over 97% using about 2,000 verified\nlabeled answers that account for less than 0.5% of the dataset. Then, the\nrecognized answers are fed into a pre-trained automatic scoring system based on\nthe BERT model without correcting misrecognized characters and providing rubric\nannotations. The automatic scoring system achieves from 0.84 to 0.98 of\nQuadratic Weighted Kappa (QWK). 
As QWK is over 0.8, it represents an acceptable\nsimilarity of scoring between the automatic scoring system and the human\nexaminers. These results are promising for further research on end-to-end\nautomatic scoring of descriptive answers.\n","authors":["Hung Tuan Nguyen","Cuong Tuan Nguyen","Haruki Oka","Tsunenori Ishioka","Masaki Nakagawa"],"pdf_url":"https://arxiv.org/pdf/2201.03215v2.pdf","comment":"Keywords: handwritten Japanese answers, handwriting recognition,\n automatic scoring, ensemble recognition, deep neural networks; Reported in\n IEICE technical report, PRMU2021-32, pp.45-50 (2021.12) Published after peer\n review and Presented in ICFHR2022, Lecture Notes in Computer Science, vol.\n 13639, pp. 274-284 (2022.11)"},{"id":"http://arxiv.org/abs/2010.01502v3","updated":"2023-11-30T06:44:57Z","published":"2020-10-04T08:00:19Z","title":"Multi-turn Response Selection using Dialogue Dependency Relations","summary":" Multi-turn response selection is a task designed for developing dialogue\nagents. The performance on this task has a remarkable improvement with\npre-trained language models. However, these models simply concatenate the turns\nin dialogue history as the input and largely ignore the dependencies between\nthe turns. In this paper, we propose a dialogue extraction algorithm to\ntransform a dialogue history into threads based on their dependency relations.\nEach thread can be regarded as a self-contained sub-dialogue. We also propose\nThread-Encoder model to encode threads and candidates into compact\nrepresentations by pre-trained Transformers and finally get the matching score\nthrough an attention layer. The experiments show that dependency relations are\nhelpful for dialogue context understanding, and our model outperforms the\nstate-of-the-art baselines on both DSTC7 and DSTC8*, with competitive results\non UbuntuV2.\n","authors":["Qi Jia","Yizhu Liu","Siyu Ren","Kenny Q. Zhu","Haifeng Tang"],"pdf_url":"https://arxiv.org/pdf/2010.01502v3.pdf","comment":"Accepted for publication as a long paper in EMNLP2020"},{"id":"http://arxiv.org/abs/2311.17696v2","updated":"2023-11-30T06:28:22Z","published":"2023-11-29T15:02:46Z","title":"How to Build an AI Tutor that Can Adapt to Any Course and Provide\n Accurate Answers Using Large Language Model and Retrieval-Augmented\n Generation","summary":" Artificial intelligence is transforming education through data-driven,\npersonalized learning solutions. This paper introduces AI Tutor, an innovative\nweb application that provides personalized tutoring in any subject using\nstate-of-the-art Large Language Model (LLM). AI Tutor ingests course materials\nto construct an adaptive knowledge base tailored to the course. When students\npose questions, it retrieves the most relevant information and generates\ndetailed, conversational responses citing supporting evidence. The system is\npowered by advanced large language models and Retrieval-Augmented Generation\n(RAG) techniques for accurate, natural question answering. We present a\nfully-functional web interface and video demonstration that showcase AI Tutor's\nversatility across diverse subjects and its ability to produce pedagogically\ncogent responses. 
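The handwriting-scoring entry above reports agreement between the automatic scorer and human examiners as Quadratic Weighted Kappa (QWK), where values above roughly 0.8 are read as acceptable agreement. For reference, the metric can be computed with scikit-learn; the rubric scores below are made-up toy values, not figures from the paper:

from sklearn.metrics import cohen_kappa_score

# hypothetical 0-3 rubric scores for ten answers
human_scores     = [3, 2, 2, 0, 1, 3, 2, 1, 0, 3]
automatic_scores = [3, 2, 1, 0, 1, 3, 3, 1, 0, 2]

qwk = cohen_kappa_score(human_scores, automatic_scores, weights="quadratic")
print(f"QWK = {qwk:.2f}")   # quadratic weighting penalises large score disagreements more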
While an initial prototype, this work represents a pioneering\nstep toward AI-enabled tutoring systems that can democratize access to\nhigh-quality, customized educational support.\n","authors":["Chenxi Dong"],"pdf_url":"https://arxiv.org/pdf/2311.17696v2.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.18260v1","updated":"2023-11-30T05:38:34Z","published":"2023-11-30T05:38:34Z","title":"Consensus, dissensus and synergy between clinicians and specialist\n foundation models in radiology report generation","summary":" Radiology reports are an instrumental part of modern medicine, informing key\nclinical decisions such as diagnosis and treatment. The worldwide shortage of\nradiologists, however, restricts access to expert care and imposes heavy\nworkloads, contributing to avoidable errors and delays in report delivery.\nWhile recent progress in automated report generation with vision-language\nmodels offer clear potential in ameliorating the situation, the path to\nreal-world adoption has been stymied by the challenge of evaluating the\nclinical quality of AI-generated reports. In this study, we build a\nstate-of-the-art report generation system for chest radiographs, Flamingo-CXR,\nby fine-tuning a well-known vision-language foundation model on radiology data.\nTo evaluate the quality of the AI-generated reports, a group of 16 certified\nradiologists provide detailed evaluations of AI-generated and human written\nreports for chest X-rays from an intensive care setting in the United States\nand an inpatient setting in India. At least one radiologist (out of two per\ncase) preferred the AI report to the ground truth report in over 60$\\%$ of\ncases for both datasets. Amongst the subset of AI-generated reports that\ncontain errors, the most frequently cited reasons were related to the location\nand finding, whereas for human written reports, most mistakes were related to\nseverity and finding. This disparity suggested potential complementarity\nbetween our AI system and human experts, prompting us to develop an assistive\nscenario in which Flamingo-CXR generates a first-draft report, which is\nsubsequently revised by a clinician. This is the first demonstration of\nclinician-AI collaboration for report writing, and the resultant reports are\nassessed to be equivalent or preferred by at least one radiologist to reports\nwritten by experts alone in 80$\\%$ of in-patient cases and 66$\\%$ of intensive\ncare cases.\n","authors":["Ryutaro Tanno","David G. T. Barrett","Andrew Sellergren","Sumedh Ghaisas","Sumanth Dathathri","Abigail See","Johannes Welbl","Karan Singhal","Shekoofeh Azizi","Tao Tu","Mike Schaekermann","Rhys May","Roy Lee","SiWai Man","Zahra Ahmed","Sara Mahdavi","Danielle Belgrave","Vivek Natarajan","Shravya Shetty","Pushmeet Kohli","Po-Sen Huang","Alan Karthikesalingam","Ira Ktena"],"pdf_url":"https://arxiv.org/pdf/2311.18260v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18248v1","updated":"2023-11-30T04:43:26Z","published":"2023-11-30T04:43:26Z","title":"mPLUG-PaperOwl: Scientific Diagram Analysis with the Multimodal Large\n Language Model","summary":" Recently, the strong text creation ability of Large Language Models(LLMs) has\ngiven rise to many tools for assisting paper reading or even writing. 
However,\nthe weak diagram analysis abilities of LLMs or Multimodal LLMs greatly limit\ntheir application scenarios, especially for scientific academic paper writing.\nIn this work, towards a more versatile copilot for academic paper writing, we\nmainly focus on strengthening the multi-modal diagram analysis ability of\nMultimodal LLMs. By parsing Latex source files of high-quality papers, we\ncarefully build a multi-modal diagram understanding dataset M-Paper. By\naligning diagrams in the paper with related paragraphs, we construct\nprofessional diagram analysis samples for training and evaluation. M-Paper is\nthe first dataset to support joint comprehension of multiple scientific\ndiagrams, including figures and tables in the format of images or Latex codes.\nBesides, to better align the copilot with the user's intention, we introduce\nthe `outline' as the control signal, which could be directly given by the user\nor revised based on auto-generated ones. Comprehensive experiments with a\nstate-of-the-art Mumtimodal LLM demonstrate that training on our dataset shows\nstronger scientific diagram understanding performance, including diagram\ncaptioning, diagram analysis, and outline recommendation. The dataset, code,\nand model are available at\nhttps://github.com/X-PLUG/mPLUG-DocOwl/tree/main/PaperOwl.\n","authors":["Anwen Hu","Yaya Shi","Haiyang Xu","Jiabo Ye","Qinghao Ye","Ming Yan","Chenliang Li","Qi Qian","Ji Zhang","Fei Huang"],"pdf_url":"https://arxiv.org/pdf/2311.18248v1.pdf","comment":"20 pages, 12 figures. arXiv admin note: text overlap with\n arXiv:2305.15225 by other authors"},{"id":"http://arxiv.org/abs/2311.17438v2","updated":"2023-11-30T04:23:58Z","published":"2023-11-29T08:29:54Z","title":"CLOMO: Counterfactual Logical Modification with Large Language Models","summary":" In this study, we delve into the realm of counterfactual reasoning\ncapabilities of large language models (LLMs). Our primary objective is to\ncultivate the counterfactual thought processes within LLMs and rigorously\nassess these processes for their validity. Specifically, we introduce a novel\ntask, Counterfactual Logical Modification (CLOMO), and a high-quality\nhuman-annotated benchmark. In this task, LLMs must adeptly alter a given\nargumentative text to uphold a predetermined logical relationship. To\neffectively evaluate a generation model's counterfactual capabilities, we\npropose an innovative evaluation metric, the LogicAware Counterfactual Score to\ndirectly evaluate the natural language output of LLMs instead of modeling the\ntask as a multiple-choice problem. Analysis shows that the proposed automatic\nmetric aligns well with human preference. Our experimental results show that\nwhile LLMs demonstrate a notable capacity for logical counterfactual thinking,\nthere remains a discernible gap between their current abilities and human\nperformance.\n","authors":["Yinya Huang","Ruixin Hong","Hongming Zhang","Wei Shao","Zhicheng Yang","Dong Yu","Changshui Zhang","Xiaodan Liang","Linqi Song"],"pdf_url":"https://arxiv.org/pdf/2311.17438v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18232v1","updated":"2023-11-30T03:59:31Z","published":"2023-11-30T03:59:31Z","title":"LMRL Gym: Benchmarks for Multi-Turn Reinforcement Learning with Language\n Models","summary":" Large language models (LLMs) provide excellent text-generation capabilities,\nbut standard prompting and generation methods generally do not lead to\nintentional or goal-directed agents and might necessitate considerable prompt\ntuning. 
This becomes particularly apparent in multi-turn conversations: even\nthe best current LLMs rarely ask clarifying questions, engage in explicit\ninformation gathering, or take actions now that lead to better decisions after\nmultiple turns. Reinforcement learning has the potential to leverage the\npowerful modeling capabilities of LLMs, as well as their internal\nrepresentation of textual interactions, to create capable goal-directed\nlanguage agents. This can enable intentional and temporally extended\ninteractions, such as with humans, through coordinated persuasion and carefully\ncrafted questions, or in goal-directed play through text games to bring about\ndesired final outcomes. However, enabling this requires the community to\ndevelop stable and reliable reinforcement learning algorithms that can\neffectively train LLMs. Developing such algorithms requires tasks that can\ngauge progress on algorithm design, provide accessible and reproducible\nevaluations for multi-turn interactions, and cover a range of task properties\nand challenges in improving reinforcement learning algorithms. Our paper\nintroduces the LMRL-Gym benchmark for evaluating multi-turn RL for LLMs,\ntogether with an open-source research framework containing a basic toolkit for\ngetting started on multi-turn RL with offline value-based and policy-based RL\nmethods. Our benchmark consists of 8 different language tasks, which require\nmultiple rounds of language interaction and cover a range of tasks in\nopen-ended dialogue and text games.\n","authors":["Marwa Abdulhai","Isadora White","Charlie Snell","Charles Sun","Joey Hong","Yuexiang Zhai","Kelvin Xu","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2311.18232v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18215v1","updated":"2023-11-30T03:19:45Z","published":"2023-11-30T03:19:45Z","title":"Automatic Construction of a Korean Toxic Instruction Dataset for Ethical\n Tuning of Large Language Models","summary":" Caution: this paper may include material that could be offensive or\ndistressing.\n The advent of Large Language Models (LLMs) necessitates the development of\ntraining approaches that mitigate the generation of unethical language and\naptly manage toxic user queries. Given the challenges related to human labor\nand the scarcity of data, we present KoTox, comprising 39K unethical\ninstruction-output pairs. This collection of automatically generated toxic\ninstructions refines the training of LLMs and establishes a foundational\nframework for improving LLMs' ethical awareness and response to various toxic\ninputs, promoting more secure and responsible interactions in Natural Language\nProcessing (NLP) applications.\n","authors":["Sungjoo Byun","Dongjun Jang","Hyemi Jo","Hyopil Shin"],"pdf_url":"https://arxiv.org/pdf/2311.18215v1.pdf","comment":"NeurIPS 2023 Workshop on Instruction Tuning and Instruction Following"},{"id":"http://arxiv.org/abs/2311.18200v1","updated":"2023-11-30T02:39:38Z","published":"2023-11-30T02:39:38Z","title":"INarIG: Iterative Non-autoregressive Instruct Generation Model For\n Word-Level Auto Completion","summary":" Computer-aided translation (CAT) aims to enhance human translation efficiency\nand is still important in scenarios where machine translation cannot meet\nquality requirements. One fundamental task within this field is Word-Level Auto\nCompletion (WLAC). WLAC predicts a target word given a source sentence,\ntranslation context, and a human typed character sequence. 
Previous works\neither employ word classification models to exploit contextual information from\nboth sides of the target word or directly disregarded the dependencies from the\nright-side context. Furthermore, the key information, i.e. human typed\nsequences, is only used as prefix constraints in the decoding module. In this\npaper, we propose the INarIG (Iterative Non-autoregressive Instruct Generation)\nmodel, which constructs the human typed sequence into Instruction Unit and\nemploys iterative decoding with subwords to fully utilize input information\ngiven in the task. Our model is more competent in dealing with low-frequency\nwords (core scenario of this task), and achieves state-of-the-art results on\nthe WMT22 and benchmark datasets, with a maximum increase of over 10%\nprediction accuracy.\n","authors":["Hengchao Shang","Zongyao Li","Daimeng Wei","Jiaxin Guo","Minghan Wang","Xiaoyu Chen","Lizhi Lei","Hao Yang"],"pdf_url":"https://arxiv.org/pdf/2311.18200v1.pdf","comment":"EMNLP2023"},{"id":"http://arxiv.org/abs/2311.18195v1","updated":"2023-11-30T02:27:34Z","published":"2023-11-30T02:27:34Z","title":"COVID-19 Vaccine Misinformation in Middle Income Countries","summary":" This paper introduces a multilingual dataset of COVID-19 vaccine\nmisinformation, consisting of annotated tweets from three middle-income\ncountries: Brazil, Indonesia, and Nigeria. The expertly curated dataset\nincludes annotations for 5,952 tweets, assessing their relevance to COVID-19\nvaccines, presence of misinformation, and the themes of the misinformation. To\naddress challenges posed by domain specificity, the low-resource setting, and\ndata imbalance, we adopt two approaches for developing COVID-19 vaccine\nmisinformation detection models: domain-specific pre-training and text\naugmentation using a large language model. Our best misinformation detection\nmodels demonstrate improvements ranging from 2.7 to 15.9 percentage points in\nmacro F1-score compared to the baseline models. Additionally, we apply our\nmisinformation detection models in a large-scale study of 19 million unlabeled\ntweets from the three countries between 2020 and 2022, showcasing the practical\napplication of our dataset and models for detecting and analyzing vaccine\nmisinformation in multiple countries and languages. Our analysis indicates that\npercentage changes in the number of new COVID-19 cases are positively\nassociated with COVID-19 vaccine misinformation rates in a staggered manner for\nBrazil and Indonesia, and there are significant positive associations between\nthe misinformation rates across the three countries.\n","authors":["Jongin Kim","Byeo Rhee Back","Aditya Agrawal","Jiaxi Wu","Veronika J. Wirtz","Traci Hong","Derry Wijaya"],"pdf_url":"https://arxiv.org/pdf/2311.18195v1.pdf","comment":"Accepted to EMNLP 2023 (Main conference), 9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.18194v1","updated":"2023-11-30T02:26:55Z","published":"2023-11-30T02:26:55Z","title":"Positional Information Matters for Invariant In-Context Learning: A Case\n Study of Simple Function Classes","summary":" In-context learning (ICL) refers to the ability of a model to condition on a\nfew in-context demonstrations (input-output examples of the underlying task) to\ngenerate the answer for a new query input, without updating parameters. Despite\nthe impressive ICL ability of LLMs, it has also been found that ICL in LLMs is\nsensitive to input demonstrations and limited to short context lengths. 
To\nunderstand the limitations and principles for successful ICL, we conduct an\ninvestigation with ICL linear regression of transformers. We characterize\nseveral Out-of-Distribution (OOD) cases for ICL inspired by realistic LLM ICL\nfailures and compare transformers with DeepSet, a simple yet powerful\narchitecture for ICL. Surprisingly, DeepSet outperforms transformers across a\nvariety of distribution shifts, implying that preserving permutation invariance\nsymmetry to input demonstrations is crucial for OOD ICL. The phenomenon\nspecifies a fundamental requirement by ICL, which we termed as ICL invariance.\nNevertheless, the positional encodings in LLMs will break ICL invariance. To\nthis end, we further evaluate transformers with identical positional encodings\nand find preserving ICL invariance in transformers achieves state-of-the-art\nperformance across various ICL distribution shifts\n","authors":["Yongqiang Chen","Binghui Xie","Kaiwen Zhou","Bo Han","Yatao Bian","James Cheng"],"pdf_url":"https://arxiv.org/pdf/2311.18194v1.pdf","comment":"Ongoing work; preliminary version"},{"id":"http://arxiv.org/abs/2310.00533v3","updated":"2023-11-30T02:18:10Z","published":"2023-10-01T00:52:24Z","title":"SELF: Language-Driven Self-Evolution for Large Language Models","summary":" Large Language Models (LLMs) have demonstrated remarkable versatility across\nvarious domains. To further advance LLMs, we propose 'SELF' (Self-Evolution\nwith Language Feedback), a novel approach that enables LLMs to self-improve\nthrough self-reflection, akin to human learning processes. SELF initiates with\na meta-skill learning process that equips the LLMs with capabilities for\nself-feedback and self-refinement. Subsequently, the model undergoes an\niterative process of self-evolution. In each iteration, it utilizes an\nunlabeled dataset of instructions to generate initial responses. These\nresponses are enhanced through self-feedback and self-refinement. The model is\nthen fine-tuned using this enhanced data. The model undergoes progressive\nimprovement through this iterative self-evolution process. Moreover, the SELF\nframework enables the model to apply self-refinement during inference, which\nfurther improves response quality. Our experiments in mathematics and general\ntasks demonstrate that SELF can enhance the capabilities of LLMs without human\nintervention. The SELF framework indicates a promising direction for the\nautonomous evolution of LLMs, transitioning them from passive information\nreceivers to active participants in their development.\n","authors":["Jianqiao Lu","Wanjun Zhong","Wenyong Huang","Yufei Wang","Fei Mi","Baojun Wang","Weichao Wang","Lifeng Shang","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2310.00533v3.pdf","comment":"17 pages, 4 figures, 11 tables"},{"id":"http://arxiv.org/abs/2311.17400v2","updated":"2023-11-30T02:08:24Z","published":"2023-11-29T07:09:13Z","title":"Improving the Robustness of Transformer-based Large Language Models with\n Dynamic Attention","summary":" Transformer-based models, such as BERT and GPT, have been widely adopted in\nnatural language processing (NLP) due to their exceptional performance.\nHowever, recent studies show their vulnerability to textual adversarial attacks\nwhere the model's output can be misled by intentionally manipulating the text\ninputs. 
Despite various methods that have been proposed to enhance the model's\nrobustness and mitigate this vulnerability, many require heavy resource\nconsumption (e.g., adversarial training) or only provide limited protection\n(e.g., defensive dropout). In this paper, we propose a novel method called\ndynamic attention, tailored for the transformer architecture, to enhance the\ninherent robustness of the model itself against various adversarial attacks.\nOur method requires no downstream task knowledge and does not incur additional\ncosts. The proposed dynamic attention consists of two modules: (i) attention\nrectification, which masks or weakens the attention value of the chosen tokens,\nand (ii) dynamic modeling, which dynamically builds the set of candidate\ntokens. Extensive experiments demonstrate that dynamic attention significantly\nmitigates the impact of adversarial attacks, achieving up to 33\\% better\nperformance than previous methods against widely-used adversarial attacks. The\nmodel-level design of dynamic attention enables it to be easily combined with\nother defense methods (e.g., adversarial training) to further enhance the\nmodel's robustness. Furthermore, we demonstrate that dynamic attention\npreserves the state-of-the-art robustness space of the original model compared\nto other dynamic modeling methods.\n","authors":["Lujia Shen","Yuwen Pu","Shouling Ji","Changjiang Li","Xuhong Zhang","Chunpeng Ge","Ting Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17400v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08540v3","updated":"2023-11-30T01:34:31Z","published":"2023-10-12T17:32:09Z","title":"Do pretrained Transformers Really Learn In-context by Gradient Descent?","summary":" The emergence of In-Context Learning (ICL) in LLMs remains a significant\nphenomenon with little understanding. To explain ICL, recent studies try to\nshed light on ICL by connecting it to Gradient Descent (GD). However, the\nquestion is, do these hold up in practice in actual pre-trained models?\n We highlight the limiting assumptions in prior works that make their context\nconsiderably different from the practical context in which language models are\ntrained. For example, the theoretical hand-constructed weights used in these\nstudies have properties that don't match those of real LLMs. Furthermore, their\nexperimental verification uses \\emph{ICL objective} (training models explicitly\nfor ICL), which differs from the emergent ICL in the wild.\n We also look for evidence in real models. We observe that ICL and GD have\ndifferent sensitivity to the order in which they observe demonstrations.\nFinally, we probe and compare the ICL vs. GD hypothesis in a natural setting.\nWe conduct comprehensive empirical analyses on language models pre-trained on\nnatural data (LLaMa-7B). Our comparisons of three performance metrics highlight\nthe inconsistent behavior of ICL and GD as a function of various factors such\nas datasets, models, and the number of demonstrations. We observe that ICL and\nGD modify the output distribution of language models differently.
These results\nindicate that the equivalence between ICL and GD remains an open hypothesis and\ncalls for further studies.\n","authors":["Lingfeng Shen","Aayush Mishra","Daniel Khashabi"],"pdf_url":"https://arxiv.org/pdf/2310.08540v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07384v3","updated":"2023-11-30T01:01:32Z","published":"2023-06-12T19:20:18Z","title":"Probing Quantifier Comprehension in Large Language Models: Another\n Example of Inverse Scaling","summary":" With their increasing size, large language models (LLMs) are becoming\nincreasingly good at language understanding tasks. But even with high\nperformance on specific downstream task, LLMs fail at simple linguistic tests\nfor negation or quantifier understanding. Previous work on quantifier\nunderstanding in LLMs show inverse scaling in understanding few-type\nquantifiers. In this paper, we question the claims of of previous work and show\nthat it is a result of inappropriate testing methodology. We also present\nalternate methods to measure quantifier comprehension in LLMs and show that\nLLMs are able to better understand the difference between the meaning of\nfew-type and most-type quantifiers as their size increases, although they are\nnot particularly good at it. We also observe inverse scaling for most-type\nquantifier understanding, which is contrary to human psycho-linguistic\nexperiments and previous work, where the model's understanding of most-type\nquantifier gets worse as the model size increases. We do this evaluation on\nmodels ranging from 125M-175B parameters, which suggests that LLMs do not do as\nwell as expected with quantifiers. We also discuss the possible reasons for\nthis and the relevance of quantifier understanding in evaluating language\nunderstanding in LLMs.\n","authors":["Akshat Gupta"],"pdf_url":"https://arxiv.org/pdf/2306.07384v3.pdf","comment":"Accepted to BlackboxNLP (EMNLP 2023)"},{"id":"http://arxiv.org/abs/2311.18248v1","updated":"2023-11-30T04:43:26Z","published":"2023-11-30T04:43:26Z","title":"mPLUG-PaperOwl: Scientific Diagram Analysis with the Multimodal Large\n Language Model","summary":" Recently, the strong text creation ability of Large Language Models(LLMs) has\ngiven rise to many tools for assisting paper reading or even writing. However,\nthe weak diagram analysis abilities of LLMs or Multimodal LLMs greatly limit\ntheir application scenarios, especially for scientific academic paper writing.\nIn this work, towards a more versatile copilot for academic paper writing, we\nmainly focus on strengthening the multi-modal diagram analysis ability of\nMultimodal LLMs. By parsing Latex source files of high-quality papers, we\ncarefully build a multi-modal diagram understanding dataset M-Paper. By\naligning diagrams in the paper with related paragraphs, we construct\nprofessional diagram analysis samples for training and evaluation. M-Paper is\nthe first dataset to support joint comprehension of multiple scientific\ndiagrams, including figures and tables in the format of images or Latex codes.\nBesides, to better align the copilot with the user's intention, we introduce\nthe `outline' as the control signal, which could be directly given by the user\nor revised based on auto-generated ones. Comprehensive experiments with a\nstate-of-the-art Mumtimodal LLM demonstrate that training on our dataset shows\nstronger scientific diagram understanding performance, including diagram\ncaptioning, diagram analysis, and outline recommendation. 
The dataset, code,\nand model are available at\nhttps://github.com/X-PLUG/mPLUG-DocOwl/tree/main/PaperOwl.\n","authors":["Anwen Hu","Yaya Shi","Haiyang Xu","Jiabo Ye","Qinghao Ye","Ming Yan","Chenliang Li","Qi Qian","Ji Zhang","Fei Huang"],"pdf_url":"https://arxiv.org/pdf/2311.18248v1.pdf","comment":"20 pages, 12 figures"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2311.18838v1","updated":"2023-11-30T18:59:56Z","published":"2023-11-30T18:59:56Z","title":"Dataset Distillation in Large Data Era","summary":" Dataset distillation aims to generate a smaller but representative subset\nfrom a large dataset, which allows a model to be trained efficiently, meanwhile\nevaluating on the original testing data distribution to achieve decent\nperformance. Many prior works have aimed to align with diverse aspects of the\noriginal datasets, such as matching the training weight trajectories, gradient,\nfeature/BatchNorm distributions, etc. In this work, we show how to distill\nvarious large-scale datasets such as full ImageNet-1K/21K under a conventional\ninput resolution of 224$\\times$224 to achieve the best accuracy over all\nprevious approaches, including SRe$^2$L, TESLA and MTT. To achieve this, we\nintroduce a simple yet effective ${\\bf C}$urriculum ${\\bf D}$ata ${\\bf\nA}$ugmentation ($\\texttt{CDA}$) during data synthesis that obtains the accuracy\non large-scale ImageNet-1K and 21K with 63.2% under IPC (Images Per Class) 50\nand 36.1% under IPC 20, respectively. Finally, we show that, by integrating all\nour enhancements together, the proposed model beats the current\nstate-of-the-art by more than 4% Top-1 accuracy on ImageNet-1K/21K and for the\nfirst time, reduces the gap to its full-data training counterpart to less than\nabsolute 15%. Moreover, this work represents the inaugural success in dataset\ndistillation on larger-scale ImageNet-21K under the standard 224$\\times$224\nresolution. Our code and distilled ImageNet-21K dataset of 20 IPC, 2K recovery\nbudget are available at https://github.com/VILA-Lab/SRe2L/tree/main/CDA.\n","authors":["Zeyuan Yin","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2311.18838v1.pdf","comment":"Code and distilled ImageNet-21K dataset are available at\n https://github.com/VILA-Lab/SRe2L/tree/main/CDA"},{"id":"http://arxiv.org/abs/2311.18839v1","updated":"2023-11-30T18:59:56Z","published":"2023-11-30T18:59:56Z","title":"TrafficMOT: A Challenging Dataset for Multi-Object Tracking in Complex\n Traffic Scenarios","summary":" Multi-object tracking in traffic videos is a crucial research area, offering\nimmense potential for enhancing traffic monitoring accuracy and promoting road\nsafety measures through the utilisation of advanced machine learning\nalgorithms. However, existing datasets for multi-object tracking in traffic\nvideos often feature limited instances or focus on single classes, which cannot\nwell simulate the challenges encountered in complex traffic scenarios. To\naddress this gap, we introduce TrafficMOT, an extensive dataset designed to\nencompass diverse traffic situations with complex scenarios. To validate the\ncomplexity and challenges presented by TrafficMOT, we conducted comprehensive\nempirical studies using three different settings: fully-supervised,\nsemi-supervised, and a recent powerful zero-shot foundation model Tracking\nAnything Model (TAM). 
The experimental results highlight the inherent\ncomplexity of this dataset, emphasising its value in driving advancements in\nthe field of traffic monitoring and multi-object tracking.\n","authors":["Lihao Liu","Yanqi Cheng","Zhongying Deng","Shujun Wang","Dongdong Chen","Xiaowei Hu","Pietro Liò","Carola-Bibiane Schönlieb","Angelica Aviles-Rivero"],"pdf_url":"https://arxiv.org/pdf/2311.18839v1.pdf","comment":"17 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.18840v1","updated":"2023-11-30T18:59:56Z","published":"2023-11-30T18:59:56Z","title":"Just Add $π$! Pose Induced Video Transformers for Understanding\n Activities of Daily Living","summary":" Video transformers have become the de facto standard for human action\nrecognition, yet their exclusive reliance on the RGB modality still limits\ntheir adoption in certain domains. One such domain is Activities of Daily\nLiving (ADL), where RGB alone is not sufficient to distinguish between visually\nsimilar actions, or actions observed from multiple viewpoints. To facilitate\nthe adoption of video transformers for ADL, we hypothesize that the\naugmentation of RGB with human pose information, known for its sensitivity to\nfine-grained motion and multiple viewpoints, is essential. Consequently, we\nintroduce the first Pose Induced Video Transformer: PI-ViT (or $\\pi$-ViT), a\nnovel approach that augments the RGB representations learned by video\ntransformers with 2D and 3D pose information. The key elements of $\\pi$-ViT are\ntwo plug-in modules, 2D Skeleton Induction Module and 3D Skeleton Induction\nModule, that are responsible for inducing 2D and 3D pose information into the\nRGB representations. These modules operate by performing pose-aware auxiliary\ntasks, a design choice that allows $\\pi$-ViT to discard the modules during\ninference. Notably, $\\pi$-ViT achieves the state-of-the-art performance on\nthree prominent ADL datasets, encompassing both real-world and large-scale\nRGB-D datasets, without requiring poses or additional computational overhead at\ninference.\n","authors":["Dominick Reilly","Srijan Das"],"pdf_url":"https://arxiv.org/pdf/2311.18840v1.pdf","comment":"Code and models will be released at:\n https://github.com/dominickrei/pi-vit"},{"id":"http://arxiv.org/abs/2311.18836v1","updated":"2023-11-30T18:59:52Z","published":"2023-11-30T18:59:52Z","title":"PoseGPT: Chatting about 3D Human Pose","summary":" We introduce PoseGPT, a framework employing Large Language Models (LLMs) to\nunderstand and reason about 3D human poses from images or textual descriptions.\nOur work is motivated by the human ability to intuitively understand postures\nfrom a single image or a brief description, a process that intertwines image\ninterpretation, world knowledge, and an understanding of body language.\nTraditional human pose estimation methods, whether image-based or text-based,\noften lack holistic scene comprehension and nuanced reasoning, leading to a\ndisconnect between visual data and its real-world implications. PoseGPT\naddresses these limitations by embedding SMPL poses as a distinct signal token\nwithin a multi-modal LLM, enabling direct generation of 3D body poses from both\ntextual and visual inputs. This approach not only simplifies pose prediction\nbut also empowers LLMs to apply their world knowledge in reasoning about human\nposes, fostering two advanced tasks: speculative pose generation and reasoning\nabout pose estimation. 
These tasks involve reasoning about humans to generate\n3D poses from subtle text queries, possibly accompanied by images. We establish\nbenchmarks for these tasks, moving beyond traditional 3D pose generation and\nestimation methods. Our results show that PoseGPT outperforms existing\nmultimodal LLMs and task-sepcific methods on these newly proposed tasks.\nFurthermore, PoseGPT's ability to understand and generate 3D human poses based\non complex reasoning opens new directions in human pose analysis.\n","authors":["Yao Feng","Jing Lin","Sai Kumar Dwivedi","Yu Sun","Priyanka Patel","Michael J. Black"],"pdf_url":"https://arxiv.org/pdf/2311.18836v1.pdf","comment":"Home page: https://yfeng95.github.io/posegpt"},{"id":"http://arxiv.org/abs/2311.18837v1","updated":"2023-11-30T18:59:52Z","published":"2023-11-30T18:59:52Z","title":"VIDiff: Translating Videos via Multi-Modal Instructions with Diffusion\n Models","summary":" Diffusion models have achieved significant success in image and video\ngeneration. This motivates a growing interest in video editing tasks, where\nvideos are edited according to provided text descriptions. However, most\nexisting approaches only focus on video editing for short clips and rely on\ntime-consuming tuning or inference. We are the first to propose Video\nInstruction Diffusion (VIDiff), a unified foundation model designed for a wide\nrange of video tasks. These tasks encompass both understanding tasks (such as\nlanguage-guided video object segmentation) and generative tasks (video editing\nand enhancement). Our model can edit and translate the desired results within\nseconds based on user instructions. Moreover, we design an iterative\nauto-regressive method to ensure consistency in editing and enhancing long\nvideos. We provide convincing generative results for diverse input videos and\nwritten instructions, both qualitatively and quantitatively. More examples can\nbe found at our website https://ChenHsing.github.io/VIDiff.\n","authors":["Zhen Xing","Qi Dai","Zihao Zhang","Hui Zhang","Han Hu","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.18837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18835v1","updated":"2023-11-30T18:59:51Z","published":"2023-11-30T18:59:51Z","title":"InstructSeq: Unifying Vision Tasks with Instruction-conditioned\n Multi-modal Sequence Generation","summary":" Empowering models to dynamically accomplish tasks specified through natural\nlanguage instructions represents a promising path toward more capable and\ngeneral artificial intelligence. In this work, we introduce InstructSeq, an\ninstruction-conditioned multi-modal modeling framework that unifies diverse\nvision tasks through flexible natural language control and handling of both\nvisual and textual data. InstructSeq employs a multimodal transformer\narchitecture encompassing visual, language, and sequential modeling. We utilize\na visual encoder to extract image features and a text encoder to encode\ninstructions. An autoregressive transformer fuses the representations and\ngenerates sequential task outputs. By training with LLM-generated natural\nlanguage instructions, InstructSeq acquires a strong comprehension of free-form\ninstructions for specifying visual tasks. This provides an intuitive interface\nfor directing capabilities using flexible natural instructions. Without any\ntask-specific tuning, InstructSeq achieves compelling performance on semantic\nsegmentation, referring expression segmentation/comprehension, and image\ncaptioning. 
The flexible control and multi-task unification empower the model\nwith more human-like versatility and generalizability for computer vision. The\ncode will be released soon at https://github.com/rongyaofang/InstructSeq.\n","authors":["Rongyao Fang","Shilin Yan","Zhaoyang Huang","Jingqiu Zhou","Hao Tian","Jifeng Dai","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2311.18835v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2311.18834v1","updated":"2023-11-30T18:59:47Z","published":"2023-11-30T18:59:47Z","title":"ART$\\boldsymbol{\\cdot}$V: Auto-Regressive Text-to-Video Generation with\n Diffusion Models","summary":" We present ART$\\boldsymbol{\\cdot}$V, an efficient framework for\nauto-regressive video generation with diffusion models. Unlike existing methods\nthat generate entire videos in one-shot, ART$\\boldsymbol{\\cdot}$V generates a\nsingle frame at a time, conditioned on the previous ones. The framework offers\nthree distinct advantages. First, it only learns simple continual motions\nbetween adjacent frames, therefore avoiding modeling complex long-range motions\nthat require huge training data. Second, it preserves the high-fidelity\ngeneration ability of the pre-trained image diffusion models by making only\nminimal network modifications. Third, it can generate arbitrarily long videos\nconditioned on a variety of prompts such as text, image or their combinations,\nmaking it highly versatile and flexible. To combat the common drifting issue in\nAR models, we propose masked diffusion model which implicitly learns which\ninformation can be drawn from reference images rather than network predictions,\nin order to reduce the risk of generating inconsistent appearances that cause\ndrifting. Moreover, we further enhance generation coherence by conditioning it\non the initial frame, which typically contains minimal noise. This is\nparticularly useful for long video generation. When trained for only two weeks\non four GPUs, ART$\\boldsymbol{\\cdot}$V already can generate videos with natural\nmotions, rich details and a high level of aesthetic quality. Besides, it\nenables various appealing applications, e.g., composing a long video from\nmultiple text prompts.\n","authors":["Wenming Weng","Ruoyu Feng","Yanhui Wang","Qi Dai","Chunyu Wang","Dacheng Yin","Zhiyuan Zhao","Kai Qiu","Jianmin Bao","Yuhui Yuan","Chong Luo","Yueyi Zhang","Zhiwei Xiong"],"pdf_url":"https://arxiv.org/pdf/2311.18834v1.pdf","comment":"24 pages, 21 figures. Project page at\n https://warranweng.github.io/art.v"},{"id":"http://arxiv.org/abs/2311.18832v1","updated":"2023-11-30T18:59:44Z","published":"2023-11-30T18:59:44Z","title":"Exploiting Diffusion Prior for Generalizable Pixel-Level Semantic\n Prediction","summary":" Contents generated by recent advanced Text-to-Image (T2I) diffusion models\nare sometimes too imaginative for existing off-the-shelf property semantic\npredictors to estimate due to the immitigable domain gap. We introduce DMP, a\npipeline utilizing pre-trained T2I models as a prior for pixel-level semantic\nprediction tasks. To address the misalignment between deterministic prediction\ntasks and stochastic T2I models, we reformulate the diffusion process through a\nsequence of interpolations, establishing a deterministic mapping between input\nRGB images and output prediction distributions. To preserve generalizability,\nwe use low-rank adaptation to fine-tune pre-trained models. 
Extensive\nexperiments across five tasks, including 3D property estimation, semantic\nsegmentation, and intrinsic image decomposition, showcase the efficacy of the\nproposed method. Despite limited-domain training data, the approach yields\nfaithful estimations for arbitrary images, surpassing existing state-of-the-art\nalgorithms.\n","authors":["Hsin-Ying Lee","Hung-Yu Tseng","Hsin-Ying Lee","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2311.18832v1.pdf","comment":"Project page: https://shinying.github.io/dmp"},{"id":"http://arxiv.org/abs/2311.18830v1","updated":"2023-11-30T18:59:33Z","published":"2023-11-30T18:59:33Z","title":"MotionEditor: Editing Video Motion via Content-Aware Diffusion","summary":" Existing diffusion-based video editing models have made gorgeous advances for\nediting attributes of a source video over time but struggle to manipulate the\nmotion information while preserving the original protagonist's appearance and\nbackground. To address this, we propose MotionEditor, a diffusion model for\nvideo motion editing. MotionEditor incorporates a novel content-aware motion\nadapter into ControlNet to capture temporal motion correspondence. While\nControlNet enables direct generation based on skeleton poses, it encounters\nchallenges when modifying the source motion in the inverted noise due to\ncontradictory signals between the noise (source) and the condition (reference).\nOur adapter complements ControlNet by involving source content to transfer\nadapted control signals seamlessly. Further, we build up a two-branch\narchitecture (a reconstruction branch and an editing branch) with a\nhigh-fidelity attention injection mechanism facilitating branch interaction.\nThis mechanism enables the editing branch to query the key and value from the\nreconstruction branch in a decoupled manner, making the editing branch retain\nthe original background and protagonist appearance. We also propose a skeleton\nalignment algorithm to address the discrepancies in pose size and position.\nExperiments demonstrate the promising motion editing ability of MotionEditor,\nboth qualitatively and quantitatively.\n","authors":["Shuyuan Tu","Qi Dai","Zhi-Qi Cheng","Han Hu","Xintong Han","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.18830v1.pdf","comment":"18 pages, 15 figures. Project page at\n https://francis-rings.github.io/MotionEditor/"},{"id":"http://arxiv.org/abs/2311.18829v1","updated":"2023-11-30T18:59:30Z","published":"2023-11-30T18:59:30Z","title":"MicroCinema: A Divide-and-Conquer Approach for Text-to-Video Generation","summary":" We present MicroCinema, a straightforward yet effective framework for\nhigh-quality and coherent text-to-video generation. Unlike existing approaches\nthat align text prompts with video directly, MicroCinema introduces a\nDivide-and-Conquer strategy which divides the text-to-video into a two-stage\nprocess: text-to-image generation and image\\&text-to-video generation. This\nstrategy offers two significant advantages. a) It allows us to take full\nadvantage of the recent advances in text-to-image models, such as Stable\nDiffusion, Midjourney, and DALLE, to generate photorealistic and highly\ndetailed images. b) Leveraging the generated image, the model can allocate less\nfocus to fine-grained appearance details, prioritizing the efficient learning\nof motion dynamics. To implement this strategy effectively, we introduce two\ncore designs. 
First, we propose the Appearance Injection Network, enhancing the\npreservation of the appearance of the given image. Second, we introduce the\nAppearance Noise Prior, a novel mechanism aimed at maintaining the capabilities\nof pre-trained 2D diffusion models. These design elements empower MicroCinema\nto generate high-quality videos with precise motion, guided by the provided\ntext prompts. Extensive experiments demonstrate the superiority of the proposed\nframework. Concretely, MicroCinema achieves SOTA zero-shot FVD of 342.86 on\nUCF-101 and 377.40 on MSR-VTT. See\nhttps://wangyanhui666.github.io/MicroCinema.github.io/ for video samples.\n","authors":["Yanhui Wang","Jianmin Bao","Wenming Weng","Ruoyu Feng","Dacheng Yin","Tao Yang","Jingxu Zhang","Qi Dai Zhiyuan Zhao","Chunyu Wang","Kai Qiu","Yuhui Yuan","Xiaoyan Sun","Chong Luo","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2311.18829v1.pdf","comment":"Project page: https://wangyanhui666.github.io/MicroCinema.github.io/"},{"id":"http://arxiv.org/abs/2311.18828v1","updated":"2023-11-30T18:59:20Z","published":"2023-11-30T18:59:20Z","title":"One-step Diffusion with Distribution Matching Distillation","summary":" Diffusion models generate high-quality images but require dozens of forward\npasses. We introduce Distribution Matching Distillation (DMD), a procedure to\ntransform a diffusion model into a one-step image generator with minimal impact\non image quality. We enforce the one-step image generator match the diffusion\nmodel at distribution level, by minimizing an approximate KL divergence whose\ngradient can be expressed as the difference between 2 score functions, one of\nthe target distribution and the other of the synthetic distribution being\nproduced by our one-step generator. The score functions are parameterized as\ntwo diffusion models trained separately on each distribution. Combined with a\nsimple regression loss matching the large-scale structure of the multi-step\ndiffusion outputs, our method outperforms all published few-step diffusion\napproaches, reaching 2.62 FID on ImageNet 64x64 and 11.49 FID on zero-shot\nCOCO-30k, comparable to Stable Diffusion but orders of magnitude faster.\nUtilizing FP16 inference, our model can generate images at 20 FPS on modern\nhardware.\n","authors":["Tianwei Yin","Michaël Gharbi","Richard Zhang","Eli Shechtman","Frédo Durand","William T. Freeman","Taesung Park"],"pdf_url":"https://arxiv.org/pdf/2311.18828v1.pdf","comment":"Project page: https://tianweiy.github.io/dmd/"},{"id":"http://arxiv.org/abs/2311.18827v1","updated":"2023-11-30T18:59:06Z","published":"2023-11-30T18:59:06Z","title":"Motion-Conditioned Image Animation for Video Editing","summary":" We introduce MoCA, a Motion-Conditioned Image Animation approach for video\nediting. It leverages a simple decomposition of the video editing problem into\nimage editing followed by motion-conditioned image animation. Furthermore,\ngiven the lack of robust evaluation datasets for video editing, we introduce a\nnew benchmark that measures edit capability across a wide variety of tasks,\nsuch as object replacement, background changes, style changes, and motion\nedits. We present a comprehensive human evaluation of the latest video editing\nmethods along with MoCA, on our proposed benchmark. 
MoCA establishes a new\nstate-of-the-art, demonstrating greater human preference win-rate, and\noutperforming notable recent approaches including Dreamix (63%), MasaCtrl\n(75%), and Tune-A-Video (72%), with especially significant improvements for\nmotion edits.\n","authors":["Wilson Yan","Andrew Brown","Pieter Abbeel","Rohit Girdhar","Samaneh Azadi"],"pdf_url":"https://arxiv.org/pdf/2311.18827v1.pdf","comment":"Project page: https://facebookresearch.github.io/MoCA"},{"id":"http://arxiv.org/abs/2311.17898v2","updated":"2023-11-30T18:59:01Z","published":"2023-11-29T18:51:46Z","title":"Knowledge Pursuit Prompting for Zero-Shot Multimodal Synthesis","summary":" Hallucinations and unfaithful synthesis due to inaccurate prompts with\ninsufficient semantic details are widely observed in multimodal generative\nmodels. A prevalent strategy to align multiple modalities is to fine-tune the\ngenerator with a large number of annotated text-image pairs. However, such a\nprocedure is labor-consuming and resource-draining. The key question we ask is:\ncan we enhance the quality and faithfulness of text-driven generative models\nbeyond extensive text-image pair annotations? To address this question, we\npropose Knowledge Pursuit Prompting (KPP), a zero-shot framework that\niteratively incorporates external knowledge to help generators produce reliable\nvisual content. Instead of training generators to handle generic prompts, KPP\nemploys a recursive knowledge query process to gather informative external\nfacts from the knowledge base, instructs a language model to compress the\nacquired knowledge for prompt refinement, and utilizes text-driven generators\nfor visual synthesis. The entire process is zero-shot, without accessing the\narchitectures and parameters of generative models. We evaluate the framework\nacross multiple text-driven generative tasks (image, 3D rendering, and video)\non datasets of different domains. We further demonstrate the extensibility and\nadaptability of KPP through varying foundation model bases and instructions.\nOur results show that KPP is capable of generating faithful and semantically\nrich content across diverse visual domains, offering a promising solution to\nimprove multimodal generative models.\n","authors":["Jinqi Luo","Kwan Ho Ryan Chan","Dimitris Dimos","René Vidal"],"pdf_url":"https://arxiv.org/pdf/2311.17898v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18825v1","updated":"2023-11-30T18:58:51Z","published":"2023-11-30T18:58:51Z","title":"CAST: Cross-Attention in Space and Time for Video Action Recognition","summary":" Recognizing human actions in videos requires spatial and temporal\nunderstanding. Most existing action recognition models lack a balanced\nspatio-temporal understanding of videos. In this work, we propose a novel\ntwo-stream architecture, called Cross-Attention in Space and Time (CAST), that\nachieves a balanced spatio-temporal understanding of videos using only RGB\ninput. Our proposed bottleneck cross-attention mechanism enables the spatial\nand temporal expert models to exchange information and make synergistic\npredictions, leading to improved performance. We validate the proposed method\nwith extensive experiments on public benchmarks with different characteristics:\nEPIC-KITCHENS-100, Something-Something-V2, and Kinetics-400. 
Our method\nconsistently shows favorable performance across these datasets, while the\nperformance of existing methods fluctuates depending on the dataset\ncharacteristics.\n","authors":["Dongho Lee","Jongseo Lee","Jinwoo Choi"],"pdf_url":"https://arxiv.org/pdf/2311.18825v1.pdf","comment":"This is an accepted NeurIPS 2023. Project webpage is available at\n https://jong980812.github.io/CAST.github.io/ Code is available at\n https://github.com/KHU-VLL/CAST"},{"id":"http://arxiv.org/abs/2311.18823v1","updated":"2023-11-30T18:58:26Z","published":"2023-11-30T18:58:26Z","title":"Initializing Models with Larger Ones","summary":" Weight initialization plays an important role in neural network training.\nWidely used initialization methods are proposed and evaluated for networks that\nare trained from scratch. However, the growing number of pretrained models now\noffers new opportunities for tackling this classical problem of weight\ninitialization. In this work, we introduce weight selection, a method for\ninitializing smaller models by selecting a subset of weights from a pretrained\nlarger model. This enables the transfer of knowledge from pretrained weights to\nsmaller models. Our experiments demonstrate that weight selection can\nsignificantly enhance the performance of small models and reduce their training\ntime. Notably, it can also be used together with knowledge distillation. Weight\nselection offers a new approach to leverage the power of pretrained models in\nresource-constrained settings, and we hope it can be a useful tool for training\nsmall models in the large-model era. Code is available at\nhttps://github.com/OscarXZQ/weight-selection.\n","authors":["Zhiqiu Xu","Yanjie Chen","Kirill Vishniakov","Yida Yin","Zhiqiang Shen","Trevor Darrell","Lingjie Liu","Zhuang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.18823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18822v1","updated":"2023-11-30T18:58:17Z","published":"2023-11-30T18:58:17Z","title":"ElasticDiffusion: Training-free Arbitrary Size Image Generation","summary":" Diffusion models have revolutionized image generation in recent years, yet\nthey are still limited to a few sizes and aspect ratios. We propose\nElasticDiffusion, a novel training-free decoding method that enables pretrained\ntext-to-image diffusion models to generate images with various sizes.\nElasticDiffusion attempts to decouple the generation trajectory of a pretrained\nmodel into local and global signals. The local signal controls low-level pixel\ninformation and can be estimated on local patches, while the global signal is\nused to maintain overall structural consistency and is estimated with a\nreference image. We test our method on CelebA-HQ (faces) and LAION-COCO\n(objects/indoor/outdoor scenes). Our experiments and qualitative results show\nsuperior image coherence quality across aspect ratios compared to\nMultiDiffusion and the standard decoding strategy of Stable Diffusion. 
Code:\nhttps://github.com/MoayedHajiAli/ElasticDiffusion-official.git\n","authors":["Moayed Haji-Ali","Guha Balakrishnan","Vicente Ordonez"],"pdf_url":"https://arxiv.org/pdf/2311.18822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14656v2","updated":"2023-11-30T18:56:42Z","published":"2023-11-24T18:46:02Z","title":"Charting New Territories: Exploring the Geographic and Geospatial\n Capabilities of Multimodal LLMs","summary":" Multimodal large language models (MLLMs) have shown remarkable capabilities\nacross a broad range of tasks but their knowledge and abilities in the\ngeographic and geospatial domains are yet to be explored, despite potential\nwide-ranging benefits to navigation, environmental research, urban development,\nand disaster response. We conduct a series of experiments exploring various\nvision capabilities of MLLMs within these domains, particularly focusing on the\nfrontier model GPT-4V, and benchmark its performance against open-source\ncounterparts. Our methodology involves challenging these models with a\nsmall-scale geographic benchmark consisting of a suite of visual tasks, testing\ntheir abilities across a spectrum of complexity. The analysis uncovers not only\nwhere such models excel, including instances where they outperform humans, but\nalso where they falter, providing a balanced view of their capabilities in the\ngeographic domain. To enable the comparison and evaluation of future models,\nour benchmark will be publicly released.\n","authors":["Jonathan Roberts","Timo Lüddecke","Rehan Sheikh","Kai Han","Samuel Albanie"],"pdf_url":"https://arxiv.org/pdf/2311.14656v2.pdf","comment":"V2: Minor formatting changes and added missing subfigure captions"},{"id":"http://arxiv.org/abs/2311.18815v1","updated":"2023-11-30T18:55:16Z","published":"2023-11-30T18:55:16Z","title":"IMMA: Immunizing text-to-image Models against Malicious Adaptation","summary":" Advancements in text-to-image models and fine-tuning methods have led to the\nincreasing risk of malicious adaptation, i.e., fine-tuning to generate harmful\nunauthorized content. Recent works, e.g., Glaze or MIST, have developed\ndata-poisoning techniques which protect the data against adaptation methods. In\nthis work, we consider an alternative paradigm for protection. We propose to\n``immunize'' the model by learning model parameters that are difficult for the\nadaptation methods when fine-tuning malicious content; in short IMMA. Empirical\nresults show IMMA's effectiveness against malicious adaptations, including\nmimicking the artistic style and learning of inappropriate/unauthorized\ncontent, over three adaptation methods: LoRA, Textual-Inversion, and\nDreamBooth.\n","authors":["Yijia Zheng","Raymond A. Yeh"],"pdf_url":"https://arxiv.org/pdf/2311.18815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18814v1","updated":"2023-11-30T18:54:08Z","published":"2023-11-30T18:54:08Z","title":"Is Underwater Image Enhancement All Object Detectors Need?","summary":" Underwater object detection is a crucial and challenging problem in marine\nengineering and aquatic robot. The difficulty is partly because of the\ndegradation of underwater images caused by light selective absorption and\nscattering. Intuitively, enhancing underwater images can benefit high-level\napplications like underwater object detection. However, it is still unclear\nwhether all object detectors need underwater image enhancement as\npre-processing. 
We therefore pose the questions \"Does underwater image\nenhancement really improve underwater object detection?\" and \"How does\nunderwater image enhancement contribute to underwater object detection?\". With\nthese two questions, we conduct extensive studies. Specifically, we use 18\nstate-of-the-art underwater image enhancement algorithms, covering traditional,\nCNN-based, and GAN-based algorithms, to pre-process underwater object detection\ndata. Then, we retrain 7 popular deep learning-based object detectors using the\ncorresponding results enhanced by different algorithms, obtaining 126\nunderwater object detection models. Coupled with 7 object detection models\nretrained using raw underwater images, we employ these 133 models to\ncomprehensively analyze the effect of underwater image enhancement on\nunderwater object detection. We expect this study can provide sufficient\nexploration to answer the aforementioned questions and draw more attention of\nthe community to the joint problem of underwater image enhancement and\nunderwater object detection. The pre-trained models and results are publicly\navailable and will be regularly updated. Project page:\nhttps://github.com/BIGWangYuDong/lqit/tree/main/configs/detection/uw_enhancement_affect_detection.\n","authors":["Yudong Wang","Jichang Guo","Wanru He","Huan Gao","Huihui Yue","Zenan Zhang","Chongyi Li"],"pdf_url":"https://arxiv.org/pdf/2311.18814v1.pdf","comment":"17 pages, 9 figures"},{"id":"http://arxiv.org/abs/2311.18810v1","updated":"2023-11-30T18:52:47Z","published":"2023-11-30T18:52:47Z","title":"Convergence of Nonconvex PnP-ADMM with MMSE Denoisers","summary":" Plug-and-Play Alternating Direction Method of Multipliers (PnP-ADMM) is a\nwidely-used algorithm for solving inverse problems by integrating physical\nmeasurement models and convolutional neural network (CNN) priors. PnP-ADMM has\nbeen theoretically proven to converge for convex data-fidelity terms and\nnonexpansive CNNs. It has however been observed that PnP-ADMM often empirically\nconverges even for expansive CNNs. This paper presents a theoretical\nexplanation for the observed stability of PnP-ADMM based on the interpretation\nof the CNN prior as a minimum mean-squared error (MMSE) denoiser. Our\nexplanation parallels a similar argument recently made for the iterative\nshrinkage/thresholding algorithm variant of PnP (PnP-ISTA) and relies on the\nconnection between MMSE denoisers and proximal operators. We also numerically\nevaluate the performance gap between PnP-ADMM using a nonexpansive DnCNN\ndenoiser and expansive DRUNet denoiser, thus motivating the use of expansive\nCNNs.\n","authors":["Chicago Park","Shirin Shoushtari","Weijie Gan","Ulugbek S. Kamilov"],"pdf_url":"https://arxiv.org/pdf/2311.18810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18809v1","updated":"2023-11-30T18:52:29Z","published":"2023-11-30T18:52:29Z","title":"FoundPose: Unseen Object Pose Estimation with Foundation Features","summary":" We propose FoundPose, a method for 6D pose estimation of unseen rigid objects\nfrom a single RGB image. The method assumes that 3D models of the objects are\navailable but does not require any object-specific training. This is achieved\nby building upon DINOv2, a recent vision foundation model with impressive\ngeneralization capabilities. An online pose estimation stage is supported by a\nminimal object representation that is built during a short onboarding stage\nfrom DINOv2 patch features extracted from rendered object templates. 
Given a\nquery image with an object segmentation mask, FoundPose first rapidly retrieves\na handful of similarly looking templates by a DINOv2-based bag-of-words\napproach. Pose hypotheses are then generated from 2D-3D correspondences\nestablished by matching DINOv2 patch features between the query image and a\nretrieved template, and finally optimized by featuremetric refinement. The\nmethod can handle diverse objects, including challenging ones with symmetries\nand without any texture, and noticeably outperforms existing RGB methods for\ncoarse pose estimation in both accuracy and speed on the standard BOP\nbenchmark. With the featuremetric and additional MegaPose refinement, which are\ndemonstrated complementary, the method outperforms all RGB competitors. Source\ncode is at: evinpinar.github.io/foundpose.\n","authors":["Evin Pınar Örnek","Yann Labbé","Bugra Tekin","Lingni Ma","Cem Keskin","Christian Forster","Tomas Hodan"],"pdf_url":"https://arxiv.org/pdf/2311.18809v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18803v1","updated":"2023-11-30T18:49:43Z","published":"2023-11-30T18:49:43Z","title":"BIOCLIP: A Vision Foundation Model for the Tree of Life","summary":" Images of the natural world, collected by a variety of cameras, from drones\nto individual phones, are increasingly abundant sources of biological\ninformation. There is an explosion of computational methods and tools,\nparticularly computer vision, for extracting biologically relevant information\nfrom images for science and conservation. Yet most of these are bespoke\napproaches designed for a specific task and are not easily adaptable or\nextendable to new questions, contexts, and datasets. A vision model for general\norganismal biology questions on images is of timely need. To approach this, we\ncurate and release TreeOfLife-10M, the largest and most diverse ML-ready\ndataset of biology images. We then develop BioCLIP, a foundation model for the\ntree of life, leveraging the unique properties of biology captured by\nTreeOfLife-10M, namely the abundance and variety of images of plants, animals,\nand fungi, together with the availability of rich structured biological\nknowledge. We rigorously benchmark our approach on diverse fine-grained biology\nclassification tasks, and find that BioCLIP consistently and substantially\noutperforms existing baselines (by 17% to 20% absolute). Intrinsic evaluation\nreveals that BioCLIP has learned a hierarchical representation conforming to\nthe tree of life, shedding light on its strong generalizability. Our code,\nmodels and data will be made available at\nhttps://github.com/Imageomics/bioclip.\n","authors":["Samuel Stevens","Jiaman Wu","Matthew J Thompson","Elizabeth G Campolongo","Chan Hee Song","David Edward Carlyn","Li Dong","Wasila M Dahdul","Charles Stewart","Tanya Berger-Wolf","Wei-Lun Chao","Yu Su"],"pdf_url":"https://arxiv.org/pdf/2311.18803v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2311.18801v1","updated":"2023-11-30T18:47:18Z","published":"2023-11-30T18:47:18Z","title":"Distributed Global Structure-from-Motion with a Deep Front-End","summary":" While initial approaches to Structure-from-Motion (SfM) revolved around both\nglobal and incremental methods, most recent applications rely on incremental\nsystems to estimate camera poses due to their superior robustness. 
Though there\nhas been tremendous progress in SfM `front-ends' powered by deep models learned\nfrom data, the state-of-the-art (incremental) SfM pipelines still rely on\nclassical SIFT features, developed in 2004. In this work, we investigate\nwhether leveraging the developments in feature extraction and matching helps\nglobal SfM perform on par with the SOTA incremental SfM approach (COLMAP). To\ndo so, we design a modular SfM framework that allows us to easily combine\ndevelopments in different stages of the SfM pipeline. Our experiments show that\nwhile developments in deep-learning based two-view correspondence estimation do\ntranslate to improvements in point density for scenes reconstructed with global\nSfM, none of them outperform SIFT when comparing with incremental SfM results\non a range of datasets. Our SfM system is designed from the ground up to\nleverage distributed computation, enabling us to parallelize computation on\nmultiple machines and scale to large scenes.\n","authors":["Ayush Baid","John Lambert","Travis Driver","Akshay Krishnan","Hayk Stepanyan","Frank Dellaert"],"pdf_url":"https://arxiv.org/pdf/2311.18801v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18799v1","updated":"2023-11-30T18:43:51Z","published":"2023-11-30T18:43:51Z","title":"X-InstructBLIP: A Framework for aligning X-Modal instruction-aware\n representations to LLMs and Emergent Cross-modal Reasoning","summary":" Vision-language pre-training and instruction tuning have demonstrated\ngeneral-purpose capabilities in 2D visual reasoning tasks by aligning visual\nencoders with state-of-the-art large language models (LLMs). In this paper, we\nintroduce a simple, yet effective, cross-modality framework built atop frozen\nLLMs that allows the integration of various modalities without extensive\nmodality-specific customization. To facilitate instruction-modality\nfine-tuning, we collect high-quality instruction tuning data in an automatic\nand scalable manner, composed of 24K QA samples for audio and 250K QA samples\nfor 3D. Leveraging instruction-aware representations, our model performs\ncomparably with leading-edge counterparts without the need of extensive\nmodality-specific pre-training or customization. Furthermore, our approach\ndemonstrates cross-modal reasoning abilities across two or more input\nmodalities, despite each modality projection being trained individually. To\nstudy the model's cross-modal abilities, we contribute a novel Discriminative\nCross-modal Reasoning (DisCRn) evaluation task, comprising 9K audio-video QA\nsamples and 28K image-3D QA samples that require the model to reason\ndiscriminatively across disparate input modalities.\n","authors":["Artemis Panagopoulou","Le Xue","Ning Yu","Junnan Li","Dongxu Li","Shafiq Joty","Ran Xu","Silvio Savarese","Caiming Xiong","Juan Carlos Niebles"],"pdf_url":"https://arxiv.org/pdf/2311.18799v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18788v1","updated":"2023-11-30T18:37:21Z","published":"2023-11-30T18:37:21Z","title":"Automated interpretation of congenital heart disease from multi-view\n echocardiograms","summary":" Congenital heart disease (CHD) is the most common birth defect and the\nleading cause of neonate death in China. Clinical diagnosis can be based on the\nselected 2D key-frames from five views. Limited by the availability of\nmulti-view data, most methods have to rely on the insufficient single view\nanalysis. 
This study proposes to automatically analyze the multi-view\nechocardiograms with a practical end-to-end framework. We collect the five-view\nechocardiograms video records of 1308 subjects (including normal controls,\nventricular septal defect (VSD) patients and atrial septal defect (ASD)\npatients) with both disease labels and standard-view key-frame labels.\nDepthwise separable convolution-based multi-channel networks are adopted to\nlargely reduce the network parameters. We also approach the imbalanced class\nproblem by augmenting the positive training samples. Our 2D key-frame model can\ndiagnose CHD or negative samples with an accuracy of 95.4\\%, and in negative,\nVSD or ASD classification with an accuracy of 92.3\\%. To further alleviate the\nwork of key-frame selection in real-world implementation, we propose an\nadaptive soft attention scheme to directly explore the raw video data. Four\nkinds of neural aggregation methods are systematically investigated to fuse the\ninformation of an arbitrary number of frames in a video. Moreover, with a view\ndetection module, the system can work without the view records. Our video-based\nmodel can diagnose with an accuracy of 93.9\\% (binary classification), and\n92.1\\% (3-class classification) in a collected 2D video testing set, which does\nnot need key-frame selection and view annotation in testing. The detailed\nablation study and the interpretability analysis are provided.\n","authors":["Jing Wang","Xiaofeng Liu","Fangyun Wang","Lin Zheng","Fengqiao Gao","Hanwen Zhang","Xin Zhang","Wanqing Xie","Binbin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.18788v1.pdf","comment":"Published in Medical Image Analysis"},{"id":"http://arxiv.org/abs/2306.17140v2","updated":"2023-11-30T18:33:12Z","published":"2023-06-29T17:41:41Z","title":"ID-Pose: Sparse-view Camera Pose Estimation by Inverting Diffusion\n Models","summary":" Given sparse views of a 3D object, estimating their camera poses is a\nlong-standing and intractable problem. Toward this goal, we consider harnessing\nthe pre-trained diffusion model of novel views conditioned on viewpoints\n(Zero-1-to-3). We present ID-Pose which inverses the denoising diffusion\nprocess to estimate the relative pose given two input images. ID-Pose adds a\nnoise to one image, and predicts the noise conditioned on the other image and a\nhypothesis of the relative pose. The prediction error is used as the\nminimization objective to find the optimal pose with the gradient descent\nmethod. We extend ID-Pose to handle more than two images and estimate each pose\nwith multiple image pairs from triangular relations. ID-Pose requires no\ntraining and generalizes to open-world images. We conduct extensive experiments\nusing casually captured photos and rendered images with random viewpoints. The\nresults demonstrate that ID-Pose significantly outperforms state-of-the-art\nmethods.\n","authors":["Weihao Cheng","Yan-Pei Cao","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2306.17140v2.pdf","comment":"Github: https://xt4d.github.io/id-pose-web/"},{"id":"http://arxiv.org/abs/2305.19480v4","updated":"2023-11-30T18:32:32Z","published":"2023-05-31T01:16:08Z","title":"Learning by Aligning 2D Skeleton Sequences in Time","summary":" This paper presents a self-supervised temporal video alignment framework\nwhich is useful for several fine-grained human activity understanding\napplications. 
In contrast with the state-of-the-art method of CASA, where\nsequences of 3D skeleton coordinates are taken directly as input, our key idea\nis to use sequences of 2D skeleton heatmaps as input. Unlike CASA which\nperforms self-attention in the temporal domain only, we feed 2D skeleton\nheatmaps to a video transformer which performs self-attention both in the\nspatial and temporal domains for extracting effective spatiotemporal and\ncontextual features. In addition, we introduce simple heatmap augmentation\ntechniques based on 2D skeletons for self-supervised learning. Despite the lack\nof 3D information, our approach achieves not only higher accuracy but also\nbetter robustness against missing and noisy keypoints than CASA. Furthermore,\nextensive evaluations on three public datasets, i.e., Penn Action, IKEA ASM,\nand H2O, demonstrate that our approach outperforms previous methods in\ndifferent fine-grained human activity understanding tasks. Finally, fusing 2D\nskeleton heatmaps with RGB videos yields the state-of-the-art on all metrics\nand datasets. To our best knowledge, our work is the first to utilize 2D\nskeleton heatmap inputs and the first to explore multi-modality fusion for\ntemporal video alignment.\n","authors":["Quoc-Huy Tran","Muhammad Ahmed","Murad Popattia","M. Hassan Ahmed","Andrey Konin","M. Zeeshan Zia"],"pdf_url":"https://arxiv.org/pdf/2305.19480v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14875v2","updated":"2023-11-30T18:28:43Z","published":"2023-06-26T17:44:36Z","title":"A Fully Unsupervised Instance Segmentation Technique for White Blood\n Cell Images","summary":" White blood cells, also known as leukocytes are group of heterogeneously\nnucleated cells which act as salient immune system cells. These are originated\nin the bone marrow and are found in blood, plasma, and lymph tissues.\nLeukocytes kill the bacteria, virus and other kind of pathogens which invade\nhuman body through phagocytosis that in turn results immunity. Detection of a\nwhite blood cell count can reveal camouflaged infections and warn doctors about\nchronic medical conditions such as autoimmune diseases, immune deficiencies,\nand blood disorders. Segmentation plays an important role in identification of\nwhite blood cells (WBC) from microscopic image analysis. The goal of\nsegmentation in a microscopic image is to divide the image into different\ndistinct regions. In our paper, we tried to propose a novel instance\nsegmentation method for segmenting the WBCs containing both the nucleus and the\ncytoplasm, from bone marrow images.\n","authors":["Shrijeet Biswas","Amartya Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2306.14875v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18775v1","updated":"2023-11-30T18:21:25Z","published":"2023-11-30T18:21:25Z","title":"CoDi-2: In-Context, Interleaved, and Interactive Any-to-Any Generation","summary":" We present CoDi-2, a versatile and interactive Multimodal Large Language\nModel (MLLM) that can follow complex multimodal interleaved instructions,\nconduct in-context learning (ICL), reason, chat, edit, etc., in an any-to-any\ninput-output modality paradigm. By aligning modalities with language for both\nencoding and generation, CoDi-2 empowers Large Language Models (LLMs) to not\nonly understand complex modality-interleaved instructions and in-context\nexamples, but also autoregressively generate grounded and coherent multimodal\noutputs in the continuous feature space. 
To train CoDi-2, we build a\nlarge-scale generation dataset encompassing in-context multimodal instructions\nacross text, vision, and audio. CoDi-2 demonstrates a wide range of zero-shot\ncapabilities for multimodal generation, such as in-context learning, reasoning,\nand compositionality of any-to-any modality generation through multi-round\ninteractive conversation. CoDi-2 surpasses previous domain-specific models on\ntasks such as subject-driven image generation, vision transformation, and audio\nediting. CoDi-2 signifies a substantial breakthrough in developing a\ncomprehensive multimodal foundation model adept at interpreting in-context\nlanguage-vision-audio interleaved instructions and producing multimodal\noutputs.\n","authors":["Zineng Tang","Ziyi Yang","Mahmoud Khademi","Yang Liu","Chenguang Zhu","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2311.18775v1.pdf","comment":"Project Page: https://codi-2.github.io/"},{"id":"http://arxiv.org/abs/2311.18773v1","updated":"2023-11-30T18:19:23Z","published":"2023-11-30T18:19:23Z","title":"Spacewalk-18: A Benchmark for Multimodal and Long-form Procedural Video\n Understanding in Novel Domains","summary":" Learning from videos is an emerging research area that enables robots to\nacquire skills from human demonstrations, such as procedural videos. To do\nthis, video-language models must be able to obtain structured understandings,\nsuch as the temporal segmentation of a demonstration into sequences of actions\nand skills, and to generalize the understandings to novel domains. In pursuit\nof this goal, we introduce Spacewalk-18, a benchmark containing two tasks: (1)\nstep recognition and (2) intra-video retrieval over a dataset of temporally\nsegmented and labeled tasks in International Space Station spacewalk\nrecordings. In tandem, the two tasks quantify a model's ability to make use of:\n(1) out-of-domain visual information; (2) a high temporal context window; and\n(3) multimodal (text + video) domains. This departs from existing benchmarks\nfor procedural video understanding, which typically deal with short context\nlengths and can be solved with a single modality. Spacewalk-18, with its\ninherent multimodal and long-form complexity, exposes the high difficulty of\ntask recognition and segmentation. We find that state-of-the-art methods\nperform poorly on our benchmark, demonstrating that the goal of generalizable\nprocedural video understanding models is far out and underscoring the need to\ndevelop new approaches to these tasks. Data, model, and code will be publicly\nreleased.\n","authors":["Rohan Myer Krishnan","Zitian Tang","Zhiqiu Yu","Chen Sun"],"pdf_url":"https://arxiv.org/pdf/2311.18773v1.pdf","comment":"Under submission. Code and models will be released at\n https://brown-palm.github.io/Spacewalk-18/"},{"id":"http://arxiv.org/abs/2311.14837v2","updated":"2023-11-30T18:14:48Z","published":"2023-11-24T20:16:38Z","title":"Benchmarking Robustness of Text-Image Composed Retrieval","summary":" Text-image composed retrieval aims to retrieve the target image through the\ncomposed query, which is specified in the form of an image plus some text that\ndescribes desired modifications to the input image. It has recently attracted\nattention due to its ability to leverage both information-rich images and\nconcise language to precisely express the requirements for target images.\nHowever, the robustness of these approaches against real-world corruptions or\nfurther text understanding has never been studied. 
In this paper, we perform\nthe first robustness study and establish three new diversified benchmarks for\nsystematic analysis of text-image composed retrieval against natural\ncorruptions in both vision and text and further probe textural understanding.\nFor natural corruption analysis, we introduce two new large-scale benchmark\ndatasets, CIRR-C and FashionIQ-C for testing in open domain and fashion domain\nrespectively, both of which apply 15 visual corruptions and 7 textural\ncorruptions. For textural understanding analysis, we introduce a new diagnostic\ndataset CIRR-D by expanding the original raw data with synthetic data, which\ncontains modified text to better probe textual understanding ability including\nnumerical variation, attribute variation, object removal, background variation,\nand fine-grained evaluation. The code and benchmark datasets are available at\nhttps://github.com/SunTongtongtong/Benchmark-Robustness-Text-Image-Compose-Retrieval.\n","authors":["Shitong Sun","Jindong Gu","Shaogang Gong"],"pdf_url":"https://arxiv.org/pdf/2311.14837v2.pdf","comment":"Accepted by R0-FoMo: Workshop on Robustness of Few-shot and Zero-shot\n Learning in Foundation Models at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.18765v1","updated":"2023-11-30T18:05:52Z","published":"2023-11-30T18:05:52Z","title":"MLLMs-Augmented Visual-Language Representation Learning","summary":" Visual-language pre-training (VLP) have achieved remarkable success in\nmulti-modal tasks, largely attributed to the availability of large-scale\nimage-text datasets. In this work, we demonstrate that multi-modal large\nlanguage models (MLLMs) can enhance visual-language representation learning by\nimproving data quality. Our approach is simple, utilizing MLLMs to extend\nmultiple captions for each image. To prevent the bias that introduced by MLLMs'\nhallucinations and intrinsic caption styles, we propose a \"text shearing\" to\nkeep the lengths of extended captions identical to the originals. In image-text\nretrieval, our method consistently obtains 5.6 ~ 35.0% and 16.8 ~ 46.1%\nimprovement on R@1 under the fine-tuning and zero-shot settings, respectively.\nNotably, our zero-shot results are comparable to fine-tuning on target\ndatasets, which encourages more exploration on the versatile use of MLLMs.\n","authors":["Yanqing Liu","Kai Wang","Wenqi Shao","Ping Luo","Yu Qiao","Mike Zheng Shou","Kaipeng Zhang","Yang You"],"pdf_url":"https://arxiv.org/pdf/2311.18765v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.06799v4","updated":"2023-11-30T18:05:07Z","published":"2022-07-14T10:23:17Z","title":"MMOTU: A Multi-Modality Ovarian Tumor Ultrasound Image Dataset for\n Unsupervised Cross-Domain Semantic Segmentation","summary":" Ovarian cancer is one of the most harmful gynecological diseases. Detecting\novarian tumors in early stage with computer-aided techniques can efficiently\ndecrease the mortality rate. With the improvement of medical treatment\nstandard, ultrasound images are widely applied in clinical treatment. However,\nrecent notable methods mainly focus on single-modality ultrasound ovarian tumor\nsegmentation or recognition, which means there still lacks researches on\nexploring the representation capability of multi-modality ultrasound ovarian\ntumor images. To solve this problem, we propose a Multi-Modality Ovarian Tumor\nUltrasound (MMOTU) image dataset containing 1469 2d ultrasound images and 170\ncontrast enhanced ultrasonography (CEUS) images with pixel-wise and global-wise\nannotations. 
Based on MMOTU, we mainly focus on unsupervised cross-domain\nsemantic segmentation task. To solve the domain shift problem, we propose a\nfeature alignment based architecture named Dual-Scheme Domain-Selected Network\n(DS2Net). Specifically, we first design source-encoder and target-encoder to\nextract two-style features of source and target images. Then, we propose\nDomain-Distinct Selected Module (DDSM) and Domain-Universal Selected Module\n(DUSM) to extract the distinct and universal features in two styles\n(source-style or target-style). Finally, we fuse these two kinds of features\nand feed them into the source-decoder and target-decoder to generate final\npredictions. Extensive comparison experiments and analysis on MMOTU image\ndataset show that DS2Net can boost the segmentation performance for\nbidirectional cross-domain adaptation of 2d ultrasound images and CEUS images.\nOur proposed dataset and code are all available at\nhttps://github.com/cv516Buaa/MMOTU_DS2Net.\n","authors":["Qi Zhao","Shuchang Lyu","Wenpei Bai","Linghan Cai","Binghao Liu","Guangliang Cheng","Meijing Wu","Xiubo Sang","Min Yang","Lijiang Chen"],"pdf_url":"https://arxiv.org/pdf/2207.06799v4.pdf","comment":"code: https://github.com/cv516Buaa/MMOTU_DS2Net paper:18 pages, 12\n figures, 11 tables, 16 formulas"},{"id":"http://arxiv.org/abs/2311.18763v1","updated":"2023-11-30T18:04:21Z","published":"2023-11-30T18:04:21Z","title":"Continual Diffusion with STAMINA: STack-And-Mask INcremental Adapters","summary":" Recent work has demonstrated a remarkable ability to customize text-to-image\ndiffusion models to multiple, fine-grained concepts in a sequential (i.e.,\ncontinual) manner while only providing a few example images for each concept.\nThis setting is known as continual diffusion. Here, we ask the question: Can we\nscale these methods to longer concept sequences without forgetting? Although\nprior work mitigates the forgetting of previously learned concepts, we show\nthat its capacity to learn new tasks reaches saturation over longer sequences.\nWe address this challenge by introducing a novel method, STack-And-Mask\nINcremental Adapters (STAMINA), which is composed of low-ranked\nattention-masked adapters and customized MLP tokens. STAMINA is designed to\nenhance the robust fine-tuning properties of LoRA for sequential concept\nlearning via learnable hard-attention masks parameterized with low rank MLPs,\nenabling precise, scalable learning via sparse adaptation. Notably, all\nintroduced trainable parameters can be folded back into the model after\ntraining, inducing no additional inference parameter costs. We show that\nSTAMINA outperforms the prior SOTA for the setting of text-to-image continual\ncustomization on a 50-concept benchmark composed of landmarks and human faces,\nwith no stored replay data. 
Additionally, we extended our method to the setting\nof continual learning for image classification, demonstrating that our gains\nalso translate to state-of-the-art performance in this standard benchmark.\n","authors":["James Seale Smith","Yen-Chang Hsu","Zsolt Kira","Yilin Shen","Hongxia Jin"],"pdf_url":"https://arxiv.org/pdf/2311.18763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18758v1","updated":"2023-11-30T18:01:03Z","published":"2023-11-30T18:01:03Z","title":"Semi-supervised Semantic Segmentation via Boosting Uncertainty on\n Unlabeled Data","summary":" We bring a new perspective to semi-supervised semantic segmentation by\nproviding an analysis on the labeled and unlabeled distributions in training\ndatasets. We first figure out that the distribution gap between labeled and\nunlabeled datasets cannot be ignored, even though the two datasets are sampled\nfrom the same distribution. To address this issue, we theoretically analyze and\nexperimentally prove that appropriately boosting uncertainty on unlabeled data\ncan help minimize the distribution gap, which benefits the generalization of\nthe model. We propose two strategies and design an uncertainty booster\nalgorithm, specially for semi-supervised semantic segmentation. Extensive\nexperiments are carried out based on these theories, and the results confirm\nthe efficacy of the algorithm and strategies. Our plug-and-play uncertainty\nbooster is tiny, efficient, and robust to hyperparameters but can significantly\npromote performance. Our approach achieves state-of-the-art performance in our\nexperiments compared to the current semi-supervised semantic segmentation\nmethods on the popular benchmarks: Cityscapes and PASCAL VOC 2012 with\ndifferent train settings.\n","authors":["Daoan Zhang","Yunhao Luo","Jianguo Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.18758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15841v2","updated":"2023-11-30T17:51:47Z","published":"2023-11-27T14:07:13Z","title":"Learning Disentangled Identifiers for Action-Customized Text-to-Image\n Generation","summary":" This study focuses on a novel task in text-to-image (T2I) generation, namely\naction customization. The objective of this task is to learn the co-existing\naction from limited data and generalize it to unseen humans or even animals.\nExperimental results show that existing subject-driven customization methods\nfail to learn the representative characteristics of actions and struggle in\ndecoupling actions from context features, including appearance. To overcome the\npreference for low-level features and the entanglement of high-level features,\nwe propose an inversion-based method Action-Disentangled Identifier (ADI) to\nlearn action-specific identifiers from the exemplar images. ADI first expands\nthe semantic conditioning space by introducing layer-wise identifier tokens,\nthereby increasing the representational richness while distributing the\ninversion across different features. Then, to block the inversion of\naction-agnostic features, ADI extracts the gradient invariance from the\nconstructed sample triples and masks the updates of irrelevant channels. To\ncomprehensively evaluate the task, we present an ActionBench that includes a\nvariety of actions, each accompanied by meticulously selected samples. Both\nquantitative and qualitative results show that our ADI outperforms existing\nbaselines in action-customized T2I generation. 
Our project page is at\nhttps://adi-t2i.github.io/ADI.\n","authors":["Siteng Huang","Biao Gong","Yutong Feng","Xi Chen","Yuqian Fu","Yu Liu","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15841v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.00916v3","updated":"2023-11-30T17:40:51Z","published":"2023-04-03T12:11:51Z","title":"DreamAvatar: Text-and-Shape Guided 3D Human Avatar Generation via\n Diffusion Models","summary":" We present DreamAvatar, a text-and-shape guided framework for generating\nhigh-quality 3D human avatars with controllable poses. While encouraging\nresults have been reported by recent methods on text-guided 3D common object\ngeneration, generating high-quality human avatars remains an open challenge due\nto the complexity of the human body's shape, pose, and appearance. We propose\nDreamAvatar to tackle this challenge, which utilizes a trainable NeRF for\npredicting density and color for 3D points and pretrained text-to-image\ndiffusion models for providing 2D self-supervision. Specifically, we leverage\nthe SMPL model to provide shape and pose guidance for the generation. We\nintroduce a dual-observation-space design that involves the joint optimization\nof a canonical space and a posed space that are related by a learnable\ndeformation field. This facilitates the generation of more complete textures\nand geometry faithful to the target pose. We also jointly optimize the losses\ncomputed from the full body and from the zoomed-in 3D head to alleviate the\ncommon multi-face ''Janus'' problem and improve facial details in the generated\navatars. Extensive evaluations demonstrate that DreamAvatar significantly\noutperforms existing methods, establishing a new state-of-the-art for\ntext-and-shape guided 3D human avatar generation.\n","authors":["Yukang Cao","Yan-Pei Cao","Kai Han","Ying Shan","Kwan-Yee K. Wong"],"pdf_url":"https://arxiv.org/pdf/2304.00916v3.pdf","comment":"Project page: https://yukangcao.github.io/DreamAvatar/"},{"id":"http://arxiv.org/abs/2311.18729v1","updated":"2023-11-30T17:26:33Z","published":"2023-11-30T17:26:33Z","title":"Learning One-Shot 4D Head Avatar Synthesis using Synthetic Data","summary":" Existing one-shot 4D head synthesis methods usually learn from monocular\nvideos with the aid of 3DMM reconstruction, yet the latter is evenly\nchallenging which restricts them from reasonable 4D head synthesis. We present\na method to learn one-shot 4D head synthesis via large-scale synthetic data.\nThe key is to first learn a part-wise 4D generative model from monocular images\nvia adversarial learning, to synthesize multi-view images of diverse identities\nand full motions as training data; then leverage a transformer-based animatable\ntriplane reconstructor to learn 4D head reconstruction using the synthetic\ndata. A novel learning strategy is enforced to enhance the generalizability to\nreal images by disentangling the learning process of 3D reconstruction and\nreenactment. Experiments demonstrate our superiority over the prior art.\n","authors":["Yu Deng","Duomin Wang","Xiaohang Ren","Xingyu Chen","Baoyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.18729v1.pdf","comment":"Project page: https://yudeng.github.io/Portrait4D/"},{"id":"http://arxiv.org/abs/2311.18710v1","updated":"2023-11-30T17:02:27Z","published":"2023-11-30T17:02:27Z","title":"Meta-Prior: Meta learning for Adaptive Inverse Problem Solvers","summary":" Deep neural networks have become a foundational tool for addressing imaging\ninverse problems. 
They are typically trained for a specific task, with a\nsupervised loss to learn a mapping from the observations to the image to\nrecover. However, real-world imaging challenges often lack ground truth data,\nrendering traditional supervised approaches ineffective. Moreover, for each new\nimaging task, a new model needs to be trained from scratch, wasting time and\nresources. To overcome these limitations, we introduce a novel approach based\non meta-learning. Our method trains a meta-model on a diverse set of imaging\ntasks that allows the model to be efficiently fine-tuned for specific tasks\nwith few fine-tuning steps. We show that the proposed method extends to the\nunsupervised setting, where no ground truth data is available. In its bilevel\nformulation, the outer level uses a supervised loss, that evaluates how well\nthe fine-tuned model performs, while the inner loss can be either supervised or\nunsupervised, relying only on the measurement operator. This allows the\nmeta-model to leverage a few ground truth samples for each task while being\nable to generalize to new imaging tasks. We show that in simple settings, this\napproach recovers the Bayes optimal estimator, illustrating the soundness of\nour approach. We also demonstrate our method's effectiveness on various tasks,\nincluding image processing and magnetic resonance imaging.\n","authors":["Matthieu Terris","Thomas Moreau"],"pdf_url":"https://arxiv.org/pdf/2311.18710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18695v1","updated":"2023-11-30T16:42:24Z","published":"2023-11-30T16:42:24Z","title":"Seg2Reg: Differentiable 2D Segmentation to 1D Regression Rendering for\n 360 Room Layout Reconstruction","summary":" State-of-the-art single-view 360-degree room layout reconstruction methods\nformulate the problem as a high-level 1D (per-column) regression task. On the\nother hand, traditional low-level 2D layout segmentation is simpler to learn\nand can represent occluded regions, but it requires complex post-processing for\nthe targeting layout polygon and sacrifices accuracy. We present Seg2Reg to\nrender 1D layout depth regression from the 2D segmentation map in a\ndifferentiable and occlusion-aware way, marrying the merits of both sides.\nSpecifically, our model predicts floor-plan density for the input\nequirectangular 360-degree image. Formulating the 2D layout representation as a\ndensity field enables us to employ `flattened' volume rendering to form 1D\nlayout depth regression. In addition, we propose a novel 3D warping\naugmentation on layout to improve generalization. Finally, we re-implement\nrecent room layout reconstruction methods into our codebase for benchmarking\nand explore modern backbones and training techniques to serve as the strong\nbaseline. Our model significantly outperforms previous arts. The code will be\nmade available upon publication.\n","authors":["Cheng Sun","Wei-En Tai","Yu-Lin Shih","Kuan-Wei Chen","Yong-Jing Syu","Kent Selwyn The","Yu-Chiang Frank Wang","Hwann-Tzong Chen"],"pdf_url":"https://arxiv.org/pdf/2311.18695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18681v1","updated":"2023-11-30T16:28:40Z","published":"2023-11-30T16:28:40Z","title":"RaDialog: A Large Vision-Language Model for Radiology Report Generation\n and Conversational Assistance","summary":" Conversational AI tools that can generate and discuss clinically correct\nradiology reports for a given medical image have the potential to transform\nradiology. 
Such a human-in-the-loop radiology assistant could facilitate a\ncollaborative diagnostic process, thus saving time and improving the quality of\nreports. Towards this goal, we introduce RaDialog, the first thoroughly\nevaluated and publicly available large vision-language model for radiology\nreport generation and interactive dialog. RaDialog effectively integrates\nvisual image features and structured pathology findings with a large language\nmodel (LLM) while simultaneously adapting it to a specialized domain using\nparameter-efficient fine-tuning. To keep the conversational abilities of the\nunderlying LLM, we propose a comprehensive, semi-automatically labeled,\nimage-grounded instruct dataset for chest X-ray radiology tasks. By training\nwith this dataset, our method achieves state-of-the-art clinical correctness in\nreport generation and shows impressive abilities in interactive tasks such as\ncorrecting reports and answering questions, serving as a foundational step\ntoward clinical dialog systems. Our code is available on github:\nhttps://github.com/ChantalMP/RaDialog.\n","authors":["Chantal Pellegrini","Ege Özsoy","Benjamin Busam","Nassir Navab","Matthias Keicher"],"pdf_url":"https://arxiv.org/pdf/2311.18681v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.18675v1","updated":"2023-11-30T16:20:54Z","published":"2023-11-30T16:20:54Z","title":"Cascaded Interaction with Eroded Deep Supervision for Salient Object\n Detection","summary":" Deep convolutional neural networks have been widely applied in salient object\ndetection and have achieved remarkable results in this field. However, existing\nmodels suffer from information distortion caused by interpolation during\nup-sampling and down-sampling. In response to this drawback, this article\nstarts from two directions in the network: feature and label. On the one hand,\na novel cascaded interaction network with a guidance module named global-local\naligned attention (GAA) is designed to reduce the negative impact of\ninterpolation on the feature side. On the other hand, a deep supervision\nstrategy based on edge erosion is proposed to reduce the negative guidance of\nlabel interpolation on lateral output. Extensive experiments on five popular\ndatasets demonstrate the superiority of our method.\n","authors":["Hewen Xiao","Jie Mei","Guangfu Ma","Weiren Wu"],"pdf_url":"https://arxiv.org/pdf/2311.18675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18666v1","updated":"2023-11-30T16:15:46Z","published":"2023-11-30T16:15:46Z","title":"Action Recognition in Video Recordings from Gynecologic Laparoscopy","summary":" Action recognition is a prerequisite for many applications in laparoscopic\nvideo analysis including but not limited to surgical training, operation room\nplanning, follow-up surgery preparation, post-operative surgical assessment,\nand surgical outcome estimation. However, automatic action recognition in\nlaparoscopic surgeries involves numerous challenges such as (I) cross-action\nand intra-action duration variation, (II) relevant content distortion due to\nsmoke, blood accumulation, fast camera motions, organ movements, object\nocclusion, and (III) surgical scene variations due to different illuminations\nand viewpoints. Besides, action annotations in laparoscopy surgeries are\nlimited and expensive due to requiring expert knowledge. 
In this study, we\ndesign and evaluate a CNN-RNN architecture as well as a customized\ntraining-inference framework to deal with the mentioned challenges in\nlaparoscopic surgery action recognition. Using stacked recurrent layers, our\nproposed network takes advantage of inter-frame dependencies to negate the\nnegative effect of content distortion and variation in action recognition.\nFurthermore, our proposed frame sampling strategy effectively manages the\nduration variations in surgical actions to enable action recognition with high\ntemporal resolution. Our extensive experiments confirm the superiority of our\nproposed method in action recognition compared to static CNNs.\n","authors":["Sahar Nasirihaghighi","Negin Ghamsarian","Daniela Stefanics","Klaus Schoeffmann","Heinrich Husslein"],"pdf_url":"https://arxiv.org/pdf/2311.18666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18665v1","updated":"2023-11-30T16:15:29Z","published":"2023-11-30T16:15:29Z","title":"Pose Estimation and Tracking for ASIST","summary":" Aircraft Ship Integrated Secure and Traverse (ASIST) is a system designed to\narrest helicopters safely and efficiently on ships. Originally, a precision\nHelicopter Position Sensing Equipment (HPSE) tracked and monitored the position\nof the helicopter relative to the Rapid Securing Device (RSD). However, using\nthe HPSE component was determined to be infeasible in the transition of the\nASIST system due to the hardware installation requirements. As a result,\nsailors track the position of the helicopters with their eyes with no sensor or\nartificially intelligent decision aid. Manually tracking the helicopter takes\nadditional time and makes recoveries more difficult, especially at high sea\nstates. Performing recoveries without the decision aid leads to higher\nuncertainty and cognitive load. PETA (Pose Estimation and Tracking for ASIST)\nis a research effort to create a helicopter tracking system prototype without\nhardware installation requirements for ASIST system operators. Its overall goal\nis to improve situational awareness and reduce operator uncertainty with\nrespect to the aircrafts position relative to the RSD, and consequently\nincrease the allowable landing area. The authors produced a prototype system\ncapable of tracking helicopters with respect to the RSD. The software included\na helicopter pose estimation component, camera pose estimation component, and a\nuser interface component. PETA demonstrated the potential for state-of-the-art\ncomputer vision algorithms Faster R-CNN and HRNet (High-Resolution Network) to\nbe used to estimate the pose of helicopters in real-time, returning ASIST to\nits originally intended capability. PETA also demonstrated that traditional\nmethods of encoder-decoders could be used to estimate the orientation of the\nhelicopter and could be used to confirm the output from HRNet.\n","authors":["Ari Goodman","Gurpreet Singh","Ryan O'Shea","Peter Teague","James Hing"],"pdf_url":"https://arxiv.org/pdf/2311.18665v1.pdf","comment":"7 pages, 8 figures. Published in the Proceedings of the ASNE 2023\n Technology, Systems & Ships Symposium. Reproduced with permission from the\n American Society of Naval Engineers. 
Distribution Statement A: Approved for\n public release; distribution is unlimited, as submitted under NAVAIR Public\n Release Authorization 2023-018"},{"id":"http://arxiv.org/abs/2311.18664v1","updated":"2023-11-30T16:13:17Z","published":"2023-11-30T16:13:17Z","title":"Multi-task learning with cross-task consistency for improved depth\n estimation in colonoscopy","summary":" Colonoscopy screening is the gold standard procedure for assessing\nabnormalities in the colon and rectum, such as ulcers and cancerous polyps.\nMeasuring the abnormal mucosal area and its 3D reconstruction can help quantify\nthe surveyed area and objectively evaluate disease burden. However, due to the\ncomplex topology of these organs and variable physical conditions, for example,\nlighting, large homogeneous texture, and image modality, estimating distance\nfrom the camera (aka depth) is highly challenging. Moreover, most colonoscopic\nvideo acquisition is monocular, making the depth estimation a non-trivial\nproblem. While methods in computer vision for depth estimation have been\nproposed and advanced on natural scene datasets, the efficacy of these\ntechniques has not been widely quantified on colonoscopy datasets. As the\ncolonic mucosa has several low-texture regions that are not well pronounced,\nlearning representations from an auxiliary task can improve salient feature\nextraction, allowing estimation of accurate camera depths. In this work, we\npropose to develop a novel multi-task learning (MTL) approach with a shared\nencoder and two decoders, namely a surface normal decoder and a depth estimator\ndecoder. Our depth estimator incorporates attention mechanisms to enhance\nglobal context awareness. We leverage the surface normal prediction to improve\ngeometric feature extraction. Also, we apply a cross-task consistency loss\namong the two geometrically related tasks, surface normal and camera depth. We\ndemonstrate an improvement of 14.17% on relative error and 10.4% improvement on\n$\\delta_{1}$ accuracy over the most accurate baseline state-of-the-art BTS\napproach. All experiments are conducted on a recently released C3VD dataset;\nthus, we provide a first benchmark of state-of-the-art methods.\n","authors":["Pedro Esteban Chavarrias Solano","Andrew Bulpitt","Venkataraman Subramanian","Sharib Ali"],"pdf_url":"https://arxiv.org/pdf/2311.18664v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2311.18661v1","updated":"2023-11-30T16:10:04Z","published":"2023-11-30T16:10:04Z","title":"Learning Part Segmentation from Synthetic Animals","summary":" Semantic part segmentation provides an intricate and interpretable\nunderstanding of an object, thereby benefiting numerous downstream tasks.\nHowever, the need for exhaustive annotations impedes its usage across diverse\nobject types. This paper focuses on learning part segmentation from synthetic\nanimals, leveraging the Skinned Multi-Animal Linear (SMAL) models to scale up\nexisting synthetic data generated by computer-aided design (CAD) animal models.\nCompared to CAD models, SMAL models generate data with a wider range of poses\nobserved in real-world scenarios. As a result, our first contribution is to\nconstruct a synthetic animal dataset of tigers and horses with more pose\ndiversity, termed Synthetic Animal Parts (SAP). We then benchmark Syn-to-Real\nanimal part segmentation from SAP to PartImageNet, namely SynRealPart, with\nexisting semantic segmentation domain adaptation methods and further improve\nthem as our second contribution. 
Concretely, we examine three Syn-to-Real\nadaptation methods but observe relative performance drop due to the innate\ndifference between the two tasks. To address this, we propose a simple yet\neffective method called Class-Balanced Fourier Data Mixing (CB-FDM). Fourier\nData Mixing aligns the spectral amplitudes of synthetic images with real\nimages, thereby making the mixed images have more similar frequency content to\nreal images. We further use Class-Balanced Pseudo-Label Re-Weighting to\nalleviate the imbalanced class distribution. We demonstrate the efficacy of\nCB-FDM on SynRealPart over previous methods with significant performance\nimprovements. Remarkably, our third contribution is to reveal that the learned\nparts from synthetic tiger and horse are transferable across all quadrupeds in\nPartImageNet, further underscoring the utility and potential applications of\nanimal part segmentation.\n","authors":["Jiawei Peng","Ju He","Prakhar Kaushik","Zihao Xiao","Jiteng Mu","Alan Yuille"],"pdf_url":"https://arxiv.org/pdf/2311.18661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18654v1","updated":"2023-11-30T16:04:30Z","published":"2023-11-30T16:04:30Z","title":"Detailed Human-Centric Text Description-Driven Large Scene Synthesis","summary":" Text-driven large scene image synthesis has made significant progress with\ndiffusion models, but controlling it is challenging. While using additional\nspatial controls with corresponding texts has improved the controllability of\nlarge scene synthesis, it is still challenging to faithfully reflect detailed\ntext descriptions without user-provided controls. Here, we propose\nDetText2Scene, a novel text-driven large-scale image synthesis with high\nfaithfulness, controllability, and naturalness in a global context for the\ndetailed human-centric text description. Our DetText2Scene consists of 1)\nhierarchical keypoint-box layout generation from the detailed description by\nleveraging large language model (LLM), 2) view-wise conditioned joint diffusion\nprocess to synthesize a large scene from the given detailed text with\nLLM-generated grounded keypoint-box layout and 3) pixel perturbation-based\npyramidal interpolation to progressively refine the large scene for global\ncoherence. Our DetText2Scene significantly outperforms prior arts in\ntext-to-large scene synthesis qualitatively and quantitatively, demonstrating\nstrong faithfulness with detailed descriptions, superior controllability, and\nexcellent naturalness in a global context.\n","authors":["Gwanghyun Kim","Dong Un Kang","Hoigi Seo","Hayeon Kim","Se Young Chun"],"pdf_url":"https://arxiv.org/pdf/2311.18654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18651v1","updated":"2023-11-30T16:00:23Z","published":"2023-11-30T16:00:23Z","title":"LL3DA: Visual Interactive Instruction Tuning for Omni-3D Understanding,\n Reasoning, and Planning","summary":" Recent advances in Large Multimodal Models (LMM) have made it possible for\nvarious applications in human-machine interactions. However, developing LMMs\nthat can comprehend, reason, and plan in complex and diverse 3D environments\nremains a challenging topic, especially considering the demand for\nunderstanding permutation-invariant point cloud 3D representations of the 3D\nscene. Existing works seek help from multi-view images, and project 2D features\nto 3D space as 3D scene representations. This, however, leads to huge\ncomputational overhead and performance degradation. 
In this paper, we present\nLL3DA, a Large Language 3D Assistant that takes point clouds as direct input and\nresponds to both textual instructions and visual prompts. This helps LMMs better\ncomprehend human interactions and further helps to remove the ambiguities in\ncluttered 3D scenes. Experiments show that LL3DA achieves remarkable results,\nand surpasses various 3D vision-language models on both 3D Dense Captioning and\n3D Question Answering.\n","authors":["Sijin Chen","Xin Chen","Chi Zhang","Mingsheng Li","Gang Yu","Hao Fei","Hongyuan Zhu","Jiayuan Fan","Tao Chen"],"pdf_url":"https://arxiv.org/pdf/2311.18651v1.pdf","comment":"Project Page: https://ll3da.github.io/"},{"id":"http://arxiv.org/abs/2311.18649v1","updated":"2023-11-30T15:57:34Z","published":"2023-11-30T15:57:34Z","title":"Simple Semantic-Aided Few-Shot Learning","summary":" Learning from a limited amount of data, namely Few-Shot Learning, stands out\nas a challenging computer vision task. Several works exploit semantics and\ndesign complicated semantic fusion mechanisms to compensate for rare\nrepresentative features within restricted data. However, relying on naive\nsemantics such as class names introduces biases due to their brevity, while\nacquiring extensive semantics from external knowledge takes a huge amount of time and\neffort. This limitation severely constrains the potential of semantics in\nfew-shot learning. In this paper, we design an automatic way called Semantic\nEvolution to generate high-quality semantics. The incorporation of high-quality\nsemantics alleviates the need for complex network structures and learning\nalgorithms used in previous works. Hence, we employ a simple two-layer network\ntermed Semantic Alignment Network to transform semantics and visual features\ninto robust class prototypes with rich discriminative features for few-shot\nclassification. The experimental results show our framework outperforms all\nprevious methods on five benchmarks, demonstrating that a simple network with\nhigh-quality semantics can beat intricate multi-modal modules on few-shot\nclassification tasks.\n","authors":["Hai Zhang","Junzhe Xu","Shanlin Jiang","Zhenan He"],"pdf_url":"https://arxiv.org/pdf/2311.18649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18645v1","updated":"2023-11-30T15:53:37Z","published":"2023-11-30T15:53:37Z","title":"Stochastic Vision Transformers with Wasserstein Distance-Aware Attention","summary":" Self-supervised learning is one of the most promising approaches to acquiring\nknowledge from limited labeled data. Despite the substantial advancements made\nin recent years, self-supervised models have posed a challenge to\npractitioners, as they do not readily provide insight into the model's\nconfidence and uncertainty. Tackling this issue is no simple feat, primarily\ndue to the complexity involved in implementing techniques that can make use of\nthe latent representations learned during pre-training without relying on\nexplicit labels. Motivated by this, we introduce a new stochastic vision\ntransformer that integrates uncertainty and distance awareness into\nself-supervised learning (SSL) pipelines. Instead of the conventional\ndeterministic vector embedding, our novel stochastic vision transformer encodes\nimage patches into elliptical Gaussian distributional embeddings. Notably, the\nattention matrices of these stochastic representational embeddings are computed\nusing Wasserstein distance-based attention, effectively capitalizing on the\ndistributional nature of these embeddings. 
Additionally, we propose a\nregularization term based on Wasserstein distance for both pre-training and\nfine-tuning processes, thereby incorporating distance awareness into latent\nrepresentations. We perform extensive experiments across different tasks such\nas in-distribution generalization, out-of-distribution detection, dataset\ncorruption, semi-supervised settings, and transfer learning to other datasets\nand tasks. Our proposed method achieves superior accuracy and calibration,\nsurpassing the self-supervised baseline in a wide range of experiments on a\nvariety of datasets.\n","authors":["Franciskus Xaverius Erick","Mina Rezaei","Johanna Paula Müller","Bernhard Kainz"],"pdf_url":"https://arxiv.org/pdf/2311.18645v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12001v2","updated":"2023-11-30T15:53:00Z","published":"2023-03-21T16:33:40Z","title":"ViC-MAE: Self-Supervised Representation Learning from Images and Video\n with Contrastive Masked Autoencoders","summary":" We propose ViC-MAE, a model that combines both Masked AutoEncoders (MAE) and\ncontrastive learning. ViC-MAE is trained using a global feature obtained by\npooling the local representations learned under an MAE reconstruction loss and\nleveraging this representation under a contrastive objective across images and\nvideo frames. We show that visual representations learned under ViC-MAE\ngeneralize well to both video and image classification tasks. Particularly,\nViC-MAE obtains state-of-the-art transfer learning performance from video to\nimages on Imagenet-1k compared to the recently proposed OmniMAE by achieving a\ntop-1 accuracy of 86% (+1.3% absolute improvement) when trained on the same\ndata and 87.1% (+2.4% absolute improvement) when training on extra data. At the\nsame time, ViC-MAE outperforms most other methods on video benchmarks by\nobtaining 75.9% top-1 accuracy on the challenging Something-Something-v2 video\nbenchmark. When training on videos and images from a diverse combination of\ndatasets, our method maintains a balanced transfer-learning performance between\nvideo and image classification benchmarks, coming only as a close second to the\nbest supervised method.\n","authors":["Jefferson Hernandez","Ruben Villegas","Vicente Ordonez"],"pdf_url":"https://arxiv.org/pdf/2303.12001v2.pdf","comment":"More results on Video and Image datasets, ViC-MAE now supports\n training on videos and images"},{"id":"http://arxiv.org/abs/2311.18635v1","updated":"2023-11-30T15:43:13Z","published":"2023-11-30T15:43:13Z","title":"DiffusionAvatars: Deferred Diffusion for High-fidelity 3D Head Avatars","summary":" DiffusionAvatars synthesizes a high-fidelity 3D head avatar of a person,\noffering intuitive control over both pose and expression. We propose a\ndiffusion-based neural renderer that leverages generic 2D priors to produce\ncompelling images of faces. For coarse guidance of the expression and head\npose, we render a neural parametric head model (NPHM) from the target\nviewpoint, which acts as a proxy geometry of the person. Additionally, to\nenhance the modeling of intricate facial expressions, we condition\nDiffusionAvatars directly on the expression codes obtained from NPHM via\ncross-attention. Finally, to synthesize consistent surface details across\ndifferent viewpoints and expressions, we rig learnable spatial features to the\nhead's surface via TriPlane lookup in NPHM's canonical space. 
We train\nDiffusionAvatars on RGB videos and corresponding tracked NPHM meshes of a\nperson and test the obtained avatars in both self-reenactment and animation\nscenarios. Our experiments demonstrate that DiffusionAvatars generates\ntemporally consistent and visually appealing videos for novel poses and\nexpressions of a person, outperforming existing approaches.\n","authors":["Tobias Kirschstein","Simon Giebenhain","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2311.18635v1.pdf","comment":"Project Page: https://tobias-kirschstein.github.io/diffusion-avatars/\n , Video: https://youtu.be/nSjDiiTnp2E"},{"id":"http://arxiv.org/abs/2303.00566v2","updated":"2023-11-30T15:39:30Z","published":"2023-03-01T15:12:55Z","title":"Structured Pruning for Deep Convolutional Neural Networks: A survey","summary":" The remarkable performance of deep Convolutional neural networks (CNNs) is\ngenerally attributed to their deeper and wider architectures, which can come\nwith significant computational costs. Pruning neural networks has thus gained\ninterest since it effectively lowers storage and computational costs. In\ncontrast to weight pruning, which results in unstructured models, structured\npruning provides the benefit of realistic acceleration by producing models that\nare friendly to hardware implementation. The special requirements of structured\npruning have led to the discovery of numerous new challenges and the\ndevelopment of innovative solutions. This article surveys the recent progress\ntowards structured pruning of deep CNNs. We summarize and compare the\nstate-of-the-art structured pruning techniques with respect to filter ranking\nmethods, regularization methods, dynamic execution, neural architecture search,\nthe lottery ticket hypothesis, and the applications of pruning. While\ndiscussing structured pruning algorithms, we briefly introduce the unstructured\npruning counterpart to emphasize their differences. Furthermore, we provide\ninsights into potential research opportunities in the field of structured\npruning. A curated list of neural network pruning papers can be found at\nhttps://github.com/he-y/Awesome-Pruning . A dedicated website offering a more\ninteractive comparison of structured pruning methods can be found at:\nhttps://huggingface.co/spaces/he-yang/Structured-Pruning-Survey .\n","authors":["Yang He","Lingao Xiao"],"pdf_url":"https://arxiv.org/pdf/2303.00566v2.pdf","comment":"Accepted by IEEE Transactions on Pattern Analysis and Machine\n Intelligence"},{"id":"http://arxiv.org/abs/2311.18628v1","updated":"2023-11-30T15:33:42Z","published":"2023-11-30T15:33:42Z","title":"A Lightweight Clustering Framework for Unsupervised Semantic\n Segmentation","summary":" Unsupervised semantic segmentation aims to label each pixel of an image to a\ncorresponding class without the use of annotated data. It is a widely\nresearched area as obtaining labeled datasets are expensive. While previous\nworks in the field demonstrated a gradual improvement in segmentation\nperformance, most of them required neural network training. This made\nsegmentation equally expensive, especially when dealing with large-scale\ndatasets. We thereby propose a lightweight clustering framework for\nunsupervised semantic segmentation. Attention features of the self-supervised\nvision transformer exhibit strong foreground-background differentiability. By\nclustering these features into a small number of clusters, we could separate\nforeground and background image patches into distinct groupings. 
In our\nclustering framework, we first obtain attention features from the\nself-supervised vision transformer. Then we extract Dataset-level,\nCategory-level and Image-level masks by clustering features within the same\ndataset, category and image. We further ensure multilevel clustering\nconsistency across the three levels and this allows us to extract patch-level\nbinary pseudo-masks. Finally, the pseudo-mask is upsampled, refined and class\nassignment is performed according to the CLS token of object regions. Our\nframework demonstrates great promise in unsupervised semantic segmentation and\nachieves state-of-the-art results on PASCAL VOC and MS COCO datasets.\n","authors":["Yau Shing Jonathan Cheung","Xi Chen","Lihe Yang","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.18628v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.06214v2","updated":"2023-11-30T15:28:20Z","published":"2022-06-13T14:44:46Z","title":"Real-World Light Field Image Super-Resolution via Degradation Modulation","summary":" Recent years have witnessed the great advances of deep neural networks (DNNs)\nin light field (LF) image super-resolution (SR). However, existing DNN-based LF\nimage SR methods are developed on a single fixed degradation (e.g., bicubic\ndownsampling), and thus cannot be applied to super-resolve real LF images with\ndiverse degradation. In this paper, we propose a simple yet effective method\nfor real-world LF image SR. In our method, a practical LF degradation model is\ndeveloped to formulate the degradation process of real LF images. Then, a\nconvolutional neural network is designed to incorporate the degradation prior\ninto the SR process. By training on LF images using our formulated degradation,\nour network can learn to modulate different degradation while incorporating\nboth spatial and angular information in LF images. Extensive experiments on\nboth synthetically degraded and real-world LF images demonstrate the\neffectiveness of our method. Compared with existing state-of-the-art single and\nLF image SR methods, our method achieves superior SR performance under a wide\nrange of degradation, and generalizes better to real LF images. Codes and\nmodels are available at https://yingqianwang.github.io/LF-DMnet/.\n","authors":["Yingqian Wang","Zhengyu Liang","Longguang Wang","Jungang Yang","Wei An","Yulan Guo"],"pdf_url":"https://arxiv.org/pdf/2206.06214v2.pdf","comment":"15 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.12114v3","updated":"2023-11-30T15:26:54Z","published":"2023-08-23T13:09:03Z","title":"Less is More -- Towards parsimonious multi-task models using structured\n sparsity","summary":" Model sparsification in deep learning promotes simpler, more interpretable\nmodels with fewer parameters. This not only reduces the model's memory\nfootprint and computational needs but also shortens inference time. This work\nfocuses on creating sparse models optimized for multiple tasks with fewer\nparameters. These parsimonious models also possess the potential to match or\noutperform dense models in terms of performance. In this work, we introduce\nchannel-wise l1/l2 group sparsity in the shared convolutional layers parameters\n(or weights) of the multi-task learning model. This approach facilitates the\nremoval of extraneous groups i.e., channels (due to l1 regularization) and also\nimposes a penalty on the weights, further enhancing the learning efficiency for\nall tasks (due to l2 regularization). 
We analyzed the results of group sparsity\nin both single-task and multi-task settings on two widely-used Multi-Task\nLearning (MTL) datasets: NYU-v2 and CelebAMask-HQ. On both datasets, which\nconsist of three different computer vision tasks each, multi-task models with\napproximately 70% sparsity outperform their dense equivalents. We also\ninvestigate how changing the degree of sparsification influences the model's\nperformance, the overall sparsity percentage, the patterns of sparsity, and the\ninference time.\n","authors":["Richa Upadhyay","Ronald Phlypo","Rajkumar Saini","Marcus Liwicki"],"pdf_url":"https://arxiv.org/pdf/2308.12114v3.pdf","comment":"accepted at First Conference on Parsimony and Learning (CPAL 2024)"},{"id":"http://arxiv.org/abs/2311.15414v2","updated":"2023-11-30T15:26:20Z","published":"2023-11-26T20:35:19Z","title":"KOPPA: Improving Prompt-based Continual Learning with Key-Query\n Orthogonal Projection and Prototype-based One-Versus-All","summary":" Drawing inspiration from prompt tuning techniques applied to Large Language\nModels, recent methods based on pre-trained ViT networks have achieved\nremarkable results in the field of Continual Learning. Specifically, these\napproaches propose to maintain a set of prompts and allocate a subset of them\nto learn each task using a key-query matching strategy. However, they may\nencounter limitations when lacking control over the correlations between old\ntask queries and keys of future tasks, the shift of features in the latent\nspace, and the relative separation of latent vectors learned in independent\ntasks. In this work, we introduce a novel key-query learning strategy based on\northogonal projection, inspired by model-agnostic meta-learning, to enhance\nprompt matching efficiency and address the challenge of shifting features.\nFurthermore, we introduce a One-Versus-All (OVA) prototype-based component that\nenhances the classification head distinction. Experimental results on benchmark\ndatasets demonstrate that our method empowers the model to achieve results\nsurpassing those of current state-of-the-art approaches by a large margin of up\nto 20%.\n","authors":["Quyen Tran","Lam Tran","Khoat Than","Toan Tran","Dinh Phung","Trung Le"],"pdf_url":"https://arxiv.org/pdf/2311.15414v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06978v3","updated":"2023-11-30T15:23:21Z","published":"2023-09-13T14:13:08Z","title":"Differentiable JPEG: The Devil is in the Details","summary":" JPEG remains one of the most widespread lossy image coding methods. However,\nthe non-differentiable nature of JPEG restricts the application in deep\nlearning pipelines. Several differentiable approximations of JPEG have recently\nbeen proposed to address this issue. This paper conducts a comprehensive review\nof existing diff. JPEG approaches and identifies critical details that have\nbeen missed by previous methods. To this end, we propose a novel diff. JPEG\napproach, overcoming previous limitations. Our approach is differentiable\nw.r.t. the input image, the JPEG quality, the quantization tables, and the\ncolor conversion parameters. We evaluate the forward and backward performance\nof our diff. JPEG approach against existing methods. Additionally, extensive\nablations are performed to evaluate crucial design choices. Our proposed diff.\nJPEG resembles the (non-diff.) reference implementation best, significantly\nsurpassing the recent-best diff. approach by $3.47$dB (PSNR) on average. 
For\nstrong compression rates, we can even improve PSNR by $9.51$dB. Strong\nadversarial attack results are yielded by our diff. JPEG, demonstrating the\neffective gradient approximation. Our code is available at\nhttps://github.com/necla-ml/Diff-JPEG.\n","authors":["Christoph Reich","Biplob Debnath","Deep Patel","Srimat Chakradhar"],"pdf_url":"https://arxiv.org/pdf/2309.06978v3.pdf","comment":"Accepted at WACV 2024. Project page:\n https://christophreich1996.github.io/differentiable_jpeg/"},{"id":"http://arxiv.org/abs/2309.01728v3","updated":"2023-11-30T15:21:01Z","published":"2023-09-04T17:22:10Z","title":"Generative-based Fusion Mechanism for Multi-Modal Tracking","summary":" Generative models (GMs) have received increasing research interest for their\nremarkable capacity to achieve comprehensive understanding. However, their\npotential application in the domain of multi-modal tracking has remained\nrelatively unexplored. In this context, we seek to uncover the potential of\nharnessing generative techniques to address the critical challenge, information\nfusion, in multi-modal tracking. In this paper, we delve into two prominent GM\ntechniques, namely, Conditional Generative Adversarial Networks (CGANs) and\nDiffusion Models (DMs). Different from the standard fusion process where the\nfeatures from each modality are directly fed into the fusion block, we\ncondition these multi-modal features with random noise in the GM framework,\neffectively transforming the original training samples into harder instances.\nThis design excels at extracting discriminative clues from the features,\nenhancing the ultimate tracking performance. To quantitatively gauge the\neffectiveness of our approach, we conduct extensive experiments across two\nmulti-modal tracking tasks, three baseline methods, and three challenging\nbenchmarks. The experimental results demonstrate that the proposed\ngenerative-based fusion mechanism achieves state-of-the-art performance,\nsetting new records on LasHeR and RGBD1K.\n","authors":["Zhangyong Tang","Tianyang Xu","Xuefeng Zhu","Xiao-Jun Wu","Josef Kittler"],"pdf_url":"https://arxiv.org/pdf/2309.01728v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18618v1","updated":"2023-11-30T15:17:46Z","published":"2023-11-30T15:17:46Z","title":"JPPF: Multi-task Fusion for Consistent Panoptic-Part Segmentation","summary":" Part-aware panoptic segmentation is a problem of computer vision that aims to\nprovide a semantic understanding of the scene at multiple levels of\ngranularity. More precisely, semantic areas, object instances, and semantic\nparts are predicted simultaneously. In this paper, we present our Joint\nPanoptic Part Fusion (JPPF) that combines the three individual segmentations\neffectively to obtain a panoptic-part segmentation. Two aspects are of utmost\nimportance for this: First, a unified model for the three problems is desired\nthat allows for mutually improved and consistent representation learning.\nSecond, balancing the combination so that it gives equal importance to all\nindividual results during fusion. Our proposed JPPF is parameter-free and\ndynamically balances its input. The method is evaluated and compared on the\nCityscapes Panoptic Parts (CPP) and Pascal Panoptic Parts (PPP) datasets in\nterms of PartPQ and Part-Whole Quality (PWQ). 
In extensive experiments, we\nverify the importance of our fair fusion, highlight its most significant impact\nfor areas that can be further segmented into parts, and demonstrate the\ngeneralization capabilities of our design without fine-tuning on 5 additional\ndatasets.\n","authors":["Shishir Muralidhara","Sravan Kumar Jagadeesh","René Schuster","Didier Stricker"],"pdf_url":"https://arxiv.org/pdf/2311.18618v1.pdf","comment":"Accepted for Springer Nature Computer Science. arXiv admin note:\n substantial text overlap with arXiv:2212.07671"},{"id":"http://arxiv.org/abs/2311.18614v1","updated":"2023-11-30T15:12:57Z","published":"2023-11-30T15:12:57Z","title":"Anatomy and Physiology of Artificial Intelligence in PET Imaging","summary":" The influence of artificial intelligence (AI) within the field of nuclear\nmedicine has been rapidly growing. Many researchers and clinicians are seeking\nto apply AI within PET, and clinicians will soon find themselves engaging with\nAI-based applications all along the chain of molecular imaging, from image\nreconstruction to enhanced reporting. This expanding presence of AI in PET\nimaging will result in greater demand for educational resources for those\nunfamiliar with AI. The objective of this article is to provide an illustrated\nguide to the core principles of modern AI, with a specific focus on aspects that\nare most likely to be encountered in PET imaging. We describe convolutional\nneural networks and algorithm training, and explain the components of the commonly\nused U-Net for segmentation and image synthesis.\n","authors":["Tyler J. Bradshaw","Alan B. McMillan"],"pdf_url":"https://arxiv.org/pdf/2311.18614v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18612v1","updated":"2023-11-30T15:11:03Z","published":"2023-11-30T15:11:03Z","title":"Cancer-Net PCa-Gen: Synthesis of Realistic Prostate Diffusion Weighted\n Imaging Data via Anatomic-Conditional Controlled Latent Diffusion","summary":" In Canada, prostate cancer is the most common form of cancer in men and\naccounted for 20% of new cancer cases for this demographic in 2022. Due to\nrecent successes in leveraging machine learning for clinical decision support,\nthere has been significant interest in the development of deep neural networks\nfor prostate cancer diagnosis, prognosis, and treatment planning using\ndiffusion weighted imaging (DWI) data. A major challenge hindering widespread\nadoption in clinical use is poor generalization of such networks due to\nscarcity of large-scale, diverse, balanced prostate imaging datasets for\ntraining such networks. In this study, we explore the efficacy of latent\ndiffusion for generating realistic prostate DWI data through the introduction\nof an anatomic-conditional controlled latent diffusion strategy. To the best of\nthe authors' knowledge, this is the first study to leverage conditioning for\nsynthesis of prostate cancer imaging. Experimental results show that the\nproposed strategy, which we call Cancer-Net PCa-Gen, enhances synthesis of\ndiverse prostate images through controllable tumour locations and better\nanatomical and textural fidelity. These crucial features make it well-suited\nfor augmenting real patient data, enabling neural networks to be trained on a\nmore diverse and comprehensive data distribution. 
The Cancer-Net PCa-Gen\nframework and sample images have been made publicly available at\nhttps://www.kaggle.com/datasets/deetsadi/cancer-net-pca-gen-dataset as a part\nof a global open-source initiative dedicated to accelerating advancement in\nmachine learning to aid clinicians in the fight against cancer.\n","authors":["Aditya Sridhar","Chi-en Amy Tai","Hayden Gunraj","Yuhao Chen","Alexander Wong"],"pdf_url":"https://arxiv.org/pdf/2311.18612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18610v1","updated":"2023-11-30T15:10:21Z","published":"2023-11-30T15:10:21Z","title":"DiffCAD: Weakly-Supervised Probabilistic CAD Model Retrieval and\n Alignment from an RGB Image","summary":" Perceiving 3D structures from RGB images based on CAD model primitives can\nenable an effective, efficient 3D object-based representation of scenes.\nHowever, current approaches rely on supervision from expensive annotations of\nCAD models associated with real images, and encounter challenges due to the\ninherent ambiguities in the task -- both in depth-scale ambiguity in monocular\nperception, as well as inexact matches of CAD database models to real\nobservations. We thus propose DiffCAD, the first weakly-supervised\nprobabilistic approach to CAD retrieval and alignment from an RGB image. We\nformulate this as a conditional generative task, leveraging diffusion to learn\nimplicit probabilistic models capturing the shape, pose, and scale of CAD\nobjects in an image. This enables multi-hypothesis generation of different\nplausible CAD reconstructions, requiring only a few hypotheses to characterize\nambiguities in depth/scale and inexact shape matches. Our approach is trained\nonly on synthetic data, leveraging monocular depth and mask estimates to enable\nrobust zero-shot adaptation to various real target domains. Despite being\ntrained solely on synthetic data, our multi-hypothesis approach can even\nsurpass the supervised state-of-the-art on the Scan2CAD dataset by 5.9% with 8\nhypotheses.\n","authors":["Daoyi Gao","Dávid Rozenberszki","Stefan Leutenegger","Angela Dai"],"pdf_url":"https://arxiv.org/pdf/2311.18610v1.pdf","comment":"Project page: https://daoyig.github.io/DiffCAD/ Video:\n https://www.youtube.com/watch?v=PCursyPosMY"},{"id":"http://arxiv.org/abs/2311.18608v1","updated":"2023-11-30T15:06:10Z","published":"2023-11-30T15:06:10Z","title":"Contrastive Denoising Score for Text-guided Latent Diffusion Image\n Editing","summary":" With the remarkable advent of text-to-image diffusion models, image editing\nmethods have become more diverse and continue to evolve. A promising recent\napproach in this realm is Delta Denoising Score (DDS) - an image editing\ntechnique based on Score Distillation Sampling (SDS) framework that leverages\nthe rich generative prior of text-to-image diffusion models. However, relying\nsolely on the difference between scoring functions is insufficient for\npreserving specific structural elements from the original image, a crucial\naspect of image editing. Inspired by the similarity and importance differences\nbetween DDS and the contrastive learning for unpaired image-to-image\ntranslation (CUT), here we present an embarrassingly simple yet very powerful\nmodification of DDS, called Contrastive Denoising Score (CDS), for latent\ndiffusion models (LDM). 
Specifically, to enforce structural correspondence\nbetween the input and output while maintaining the controllability of contents,\nwe introduce a straightforward approach to regulate structural consistency\nusing CUT loss within the DDS framework. To calculate this loss, instead of\nemploying auxiliary networks, we utilize the intermediate features of LDM, in\nparticular, those from the self-attention layers, which possess rich spatial\ninformation. Our approach enables zero-shot image-to-image translation and\nneural radiance field (NeRF) editing, achieving a well-balanced interplay\nbetween maintaining the structural details and transforming content.\nQualitative results and comparisons demonstrate the effectiveness of our\nproposed method. Project page with code is available at\nhttps://hyelinnam.github.io/CDS/.\n","authors":["Hyelin Nam","Gihyun Kwon","Geon Yeong Park","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2311.18608v1.pdf","comment":"Project page: https://hyelinnam.github.io/CDS/"},{"id":"http://arxiv.org/abs/2311.01714v2","updated":"2023-11-30T15:02:57Z","published":"2023-11-03T05:01:51Z","title":"EXIM: A Hybrid Explicit-Implicit Representation for Text-Guided 3D Shape\n Generation","summary":" This paper presents a new text-guided technique for generating 3D shapes. The\ntechnique leverages a hybrid 3D shape representation, namely EXIM, combining\nthe strengths of explicit and implicit representations. Specifically, the\nexplicit stage controls the topology of the generated 3D shapes and enables\nlocal modifications, whereas the implicit stage refines the shape and paints it\nwith plausible colors. Also, the hybrid approach separates the shape and color\nand generates color conditioned on shape to ensure shape-color consistency.\nUnlike the existing state-of-the-art methods, we achieve high-fidelity shape\ngeneration from natural-language descriptions without the need for\ntime-consuming per-shape optimization or reliance on human-annotated texts\nduring training or test-time optimization. Further, we demonstrate the\napplicability of our approach to generate indoor scenes with consistent styles\nusing text-induced 3D shapes. Through extensive experiments, we demonstrate the\ncompelling quality of our results and the high coherency of our generated\nshapes with the input texts, surpassing the performance of existing methods by\na significant margin. Codes and models are released at\nhttps://github.com/liuzhengzhe/EXIM.\n","authors":["Zhengzhe Liu","Jingyu Hu","Ka-Hei Hui","Xiaojuan Qi","Daniel Cohen-Or","Chi-Wing Fu"],"pdf_url":"https://arxiv.org/pdf/2311.01714v2.pdf","comment":"SIGGRAPH Asia 2023 & TOG Project page:\n https://liuzhengzhe.github.io/EXIM.github.io/"},{"id":"http://arxiv.org/abs/2311.18605v1","updated":"2023-11-30T15:02:13Z","published":"2023-11-30T15:02:13Z","title":"Learning Triangular Distribution in Visual World","summary":" Convolutional neural networks are successful in pervasive vision tasks, including\nlabel distribution learning, which usually takes the form of learning an\ninjection from the non-linear visual features to the well-defined labels.\nHowever, how the discrepancy between features is mapped to the label\ndiscrepancy is ambiguous, and its correctness is not guaranteed. To address these\nproblems, we study the mathematical connection between feature and its label,\npresenting a general and simple framework for label distribution learning. 
We\npropose a so-called Triangular Distribution Transform (TDT) to build an\ninjective function between feature and label, guaranteeing that any symmetric\nfeature discrepancy linearly reflects the difference between labels. The\nproposed TDT can be used as a plug-in in mainstream backbone networks to\naddress different label distribution learning tasks. Experiments on Facial Age\nRecognition, Illumination Chromaticity Estimation, and Aesthetics assessment\nshow that TDT achieves on-par or better results than the prior arts.\n","authors":["Ping Chen","Xingpeng Zhang","Chengtao Zhou","Dichao Fan","Peng Tu","Le Zhang","Yanlin Qian"],"pdf_url":"https://arxiv.org/pdf/2311.18605v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2305.16526v2","updated":"2023-11-30T14:57:59Z","published":"2023-05-25T23:07:43Z","title":"Extending Explainable Boosting Machines to Scientific Image Data","summary":" As the deployment of computer vision technology becomes increasingly common\nin science, the need for explanations of the system and its output has become a\nfocus of great concern. Driven by the pressing need for interpretable models in\nscience, we propose the use of Explainable Boosting Machines (EBMs) for\nscientific image data. Inspired by an important application underpinning the\ndevelopment of quantum technologies, we apply EBMs to cold-atom soliton image\ndata tabularized using Gabor Wavelet Transform-based techniques that preserve\nthe spatial structure of the data. In doing so, we demonstrate the use of EBMs\nfor image data for the first time and show that our approach provides\nexplanations that are consistent with human intuition about the data.\n","authors":["Daniel Schug","Sai Yerramreddy","Rich Caruana","Craig Greenberg","Justyna P. Zwolak"],"pdf_url":"https://arxiv.org/pdf/2305.16526v2.pdf","comment":"7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2311.15010v2","updated":"2023-11-30T14:57:30Z","published":"2023-11-25T12:33:54Z","title":"Adapter is All You Need for Tuning Visual Tasks","summary":" Pre-training & fine-tuning can enhance the transferring efficiency and\nperformance in visual tasks. Recent delta-tuning methods provide more options\nfor visual classification tasks. Despite their success, existing visual\ndelta-tuning art fails to exceed the upper limit of full fine-tuning on\nchallenging tasks like instance segmentation and semantic segmentation. To find\na competitive alternative to full fine-tuning, we propose the Multi-cognitive\nVisual Adapter (Mona) tuning, a novel adapter-based tuning method. First, we\nintroduce multiple vision-friendly filters into the adapter to enhance its\nability to process visual signals, while previous methods mainly rely on\nlanguage-friendly linear filters. Second, we add the scaled normalization layer\nin the adapter to regulate the distribution of input features for visual\nfilters. To fully demonstrate the practicality and generality of Mona, we\nconduct experiments on multiple representative visual tasks, including instance\nsegmentation on COCO, semantic segmentation on ADE20K, object detection on\nPascal VOC, and image classification on several common datasets. Exciting\nresults illustrate that Mona surpasses full fine-tuning on all these tasks and\nis the only delta-tuning method outperforming full fine-tuning on instance\nsegmentation and semantic segmentation tasks. 
For example, Mona achieves a 1%\nperformance gain on the COCO dataset compared to full fine-tuning.\nComprehensive results suggest that Mona-tuning is more suitable for retaining\nand utilizing the capabilities of pre-trained models than full fine-tuning. The\ncode will be released at https://github.com/Leiyi-Hu/mona.\n","authors":["Dongshuo Yin","Leiyi Hu","Bin Li","Youqun Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15010v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14631v2","updated":"2023-11-30T14:42:07Z","published":"2023-11-24T17:55:10Z","title":"CatVersion: Concatenating Embeddings for Diffusion-Based Text-to-Image\n Personalization","summary":" We propose CatVersion, an inversion-based method that learns the personalized\nconcept through a handful of examples. Subsequently, users can utilize text\nprompts to generate images that embody the personalized concept, thereby\nachieving text-to-image personalization. In contrast to existing approaches\nthat emphasize word embedding learning or parameter fine-tuning for the\ndiffusion model, which potentially causes concept dilution or overfitting, our\nmethod concatenates embeddings on the feature-dense space of the text encoder\nin the diffusion model to learn the gap between the personalized concept and\nits base class, aiming to maximize the preservation of prior knowledge in\ndiffusion models while restoring the personalized concepts. To this end, we\nfirst dissect the text encoder's integration in the image generation process to\nidentify the feature-dense space of the encoder. Afterward, we concatenate\nembeddings on the Keys and Values in this space to learn the gap between the\npersonalized concept and its base class. In this way, the concatenated\nembeddings ultimately manifest as a residual on the original attention output.\nTo more accurately and unbiasedly quantify the results of personalized image\ngeneration, we improve the CLIP image alignment score based on masks.\nQualitatively and quantitatively, CatVersion helps to restore personalization\nconcepts more faithfully and enables more robust editing.\n","authors":["Ruoyu Zhao","Mingrui Zhu","Shiyin Dong","Nannan Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2311.14631v2.pdf","comment":"For the project page, please visit\n https://royzhao926.github.io/CatVersion-page/"},{"id":"http://arxiv.org/abs/2305.08192v2","updated":"2023-11-30T14:40:54Z","published":"2023-05-14T16:02:36Z","title":"Diffusion Models for Imperceptible and Transferable Adversarial Attack","summary":" Many existing adversarial attacks generate $L_p$-norm perturbations on image\nRGB space. Despite some achievements in transferability and attack success\nrate, the crafted adversarial examples are easily perceived by human eyes.\nTowards visual imperceptibility, some recent works explore unrestricted attacks\nwithout $L_p$-norm constraints, yet lacking transferability of attacking\nblack-box models. In this work, we propose a novel imperceptible and\ntransferable attack by leveraging both the generative and discriminative power\nof diffusion models. Specifically, instead of direct manipulation in pixel\nspace, we craft perturbations in the latent space of diffusion models. Combined\nwith well-designed content-preserving structures, we can generate\nhuman-insensitive perturbations embedded with semantic clues. 
For better\ntransferability, we further \"deceive\" the diffusion model, which can be viewed\nas an implicit recognition surrogate, by distracting its attention away from\nthe target regions. To our knowledge, our proposed method, DiffAttack, is the\nfirst that introduces diffusion models into the adversarial attack field.\nExtensive experiments on various model structures, datasets, and defense\nmethods have demonstrated the superiority of our attack over the existing\nattack methods.\n","authors":["Jianqi Chen","Hao Chen","Keyan Chen","Yilan Zhang","Zhengxia Zou","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2305.08192v2.pdf","comment":"Code Page: https://github.com/WindVChen/DiffAttack. In Paper Version\n v2, we incorporate more discussions and experiments"},{"id":"http://arxiv.org/abs/2311.18592v1","updated":"2023-11-30T14:35:51Z","published":"2023-11-30T14:35:51Z","title":"Semantic-Aware Frame-Event Fusion based Pattern Recognition via Large\n Vision-Language Models","summary":" Pattern recognition through the fusion of RGB frames and Event streams has\nemerged as a novel research area in recent years. Current methods typically\nemploy backbone networks to individually extract the features of RGB frames and\nevent streams, and subsequently fuse these features for pattern recognition.\nHowever, we posit that these methods may suffer from key issues like semantic\ngaps and small-scale backbone networks. In this study, we introduce a novel\npattern recognition framework that consolidates the semantic labels, RGB\nframes, and event streams, leveraging pre-trained large-scale vision-language\nmodels. Specifically, given the input RGB frames, event streams, and all the\npredefined semantic labels, we employ a pre-trained large-scale vision model\n(CLIP vision encoder) to extract the RGB and event features. To handle the\nsemantic labels, we initially convert them into language descriptions through\nprompt engineering, and then obtain the semantic features using the pre-trained\nlarge-scale language model (CLIP text encoder). Subsequently, we integrate the\nRGB/Event features and semantic features using multimodal Transformer networks.\nThe resulting frame and event tokens are further amplified using self-attention\nlayers. Concurrently, we propose to enhance the interactions between text\ntokens and RGB/Event tokens via cross-attention. Finally, we consolidate all\nthree modalities using self-attention and feed-forward layers for recognition.\nComprehensive experiments on the HARDVS and PokerEvent datasets fully\nsubstantiate the efficacy of our proposed SAFE model. The source code will be\nmade available at https://github.com/Event-AHU/SAFE_LargeVLM.\n","authors":["Dong Li","Jiandong Jin","Yuhao Zhang","Yanlin Zhong","Yaoyang Wu","Lan Chen","Xiao Wang","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2311.18592v1.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2311.14049v2","updated":"2023-11-30T14:34:14Z","published":"2023-11-23T15:05:12Z","title":"Assessment of Deep Learning Segmentation for Real-Time Free-Breathing\n Cardiac Magnetic Resonance Imaging","summary":" In recent years, a variety of deep learning networks for cardiac MRI (CMR)\nsegmentation have been developed and analyzed. However, nearly all of them are\nfocused on cine CMR under breath-hold. 
In this work, the accuracy of deep learning\nmethods is assessed for volumetric analysis (via segmentation) of the left\nventricle in real-time free-breathing CMR at rest and under exercise stress.\nData from healthy volunteers (n=15) for cine and real-time free-breathing CMR\nwere analyzed retrospectively. Segmentations of a commercial software (comDL)\nand a freely available neural network (nnU-Net) were compared to a reference\ncreated via the manual correction of comDL segmentation. Segmentation of left\nventricular endocardium (LV), left ventricular myocardium (MYO), and right\nventricle (RV) is evaluated for both end-systolic and end-diastolic phases and\nanalyzed with Dice's coefficient (DC). The volumetric analysis includes LV\nend-diastolic volume (EDV), LV end-systolic volume (ESV), and LV ejection\nfraction (EF). For cine CMR, nnU-Net and comDL achieve a DC above 0.95 for LV\nand 0.9 for MYO and RV. For real-time CMR, the accuracy of nnU-Net exceeds\nthat of comDL overall. For real-time CMR at rest, nnU-Net achieves a DC of 0.94\nfor LV, 0.89 for MYO, and 0.90 for RV; mean absolute differences between\nnnU-Net and reference are 2.9mL for EDV, 3.5mL for ESV and 2.6% for EF. For\nreal-time CMR under exercise stress, nnU-Net achieves a DC of 0.92 for LV, 0.85\nfor MYO, and 0.83 for RV; mean absolute differences between nnU-Net and\nreference are 11.4mL for EDV, 2.9mL for ESV and 3.6% for EF. Deep learning\nmethods designed or trained for cine CMR segmentation can perform well on\nreal-time CMR. For real-time free-breathing CMR at rest, the performance of\ndeep learning methods is comparable to inter-observer variability in cine CMR\nand is usable for fully automatic segmentation.\n","authors":["Martin Schilling","Christina Unterberg-Buchwald","Joachim Lotz","Martin Uecker"],"pdf_url":"https://arxiv.org/pdf/2311.14049v2.pdf","comment":"Martin Schilling and Christina Unterberg-Buchwald contributed equally\n to this work"},{"id":"http://arxiv.org/abs/2311.18578v1","updated":"2023-11-30T14:17:57Z","published":"2023-11-30T14:17:57Z","title":"Communication-Efficient Heterogeneous Federated Learning with\n Generalized Heavy-Ball Momentum","summary":" Federated Learning (FL) is the state-of-the-art approach for learning from\ndecentralized data in privacy-constrained scenarios. As the current literature\nreports, the main problems associated with FL refer to system and statistical\nchallenges: the former ones demand efficient learning from edge devices,\nincluding lowering communication bandwidth and frequency, while the latter\nrequire algorithms robust to non-iidness. State-of-the-art approaches either\nguarantee convergence at increased communication cost or are not sufficiently\nrobust to handle extremely heterogeneous local distributions. In this work we\npropose a novel generalization of the heavy-ball momentum, and present FedHBM\nto effectively address statistical heterogeneity in FL without introducing any\ncommunication overhead. We conduct extensive experimentation on common FL\nvision and NLP datasets, showing that our FedHBM algorithm empirically yields\nbetter model quality and higher convergence speed w.r.t. the state-of-the-art,\nespecially in pathological non-iid scenarios. While being designed for\ncross-silo settings, we show how FedHBM is applicable in moderate-to-high\ncross-device scenarios, and how good model initializations (e.g. pre-training)\ncan be exploited for prompt acceleration. 
Extended experimentation on\nlarge-scale real-world federated datasets further corroborates the\neffectiveness of our approach for real-world FL applications.\n","authors":["Riccardo Zaccone","Carlo Masone","Marco Ciccone"],"pdf_url":"https://arxiv.org/pdf/2311.18578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07404v3","updated":"2023-11-30T14:16:13Z","published":"2023-06-12T20:12:02Z","title":"Compositor: Bottom-up Clustering and Compositing for Robust Part and\n Object Segmentation","summary":" In this work, we present a robust approach for joint part and object\nsegmentation. Specifically, we reformulate object and part segmentation as an\noptimization problem and build a hierarchical feature representation including\npixel, part, and object-level embeddings to solve it in a bottom-up clustering\nmanner. Pixels are grouped into several clusters where the part-level\nembeddings serve as cluster centers. Afterwards, object masks are obtained by\ncompositing the part proposals. This bottom-up interaction is shown to be\neffective in integrating information from lower semantic levels to higher\nsemantic levels. Based on that, our novel approach Compositor produces part and\nobject segmentation masks simultaneously while improving the mask quality.\nCompositor achieves state-of-the-art performance on PartImageNet and\nPascal-Part by outperforming previous methods by around 0.9% and 1.3% on\nPartImageNet, 0.4% and 1.7% on Pascal-Part in terms of part and object mIoU and\ndemonstrates better robustness against occlusion by around 4.4% and 7.1% on\npart and object respectively. Code will be available at\nhttps://github.com/TACJu/Compositor.\n","authors":["Ju He","Jieneng Chen","Ming-Xian Lin","Qihang Yu","Alan Yuille"],"pdf_url":"https://arxiv.org/pdf/2306.07404v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18576v1","updated":"2023-11-30T14:15:39Z","published":"2023-11-30T14:15:39Z","title":"Fingerprint Matching with Localized Deep Representation","summary":" Compared to minutia-based fingerprint representations, fixed-length\nrepresentations are attractive due to simple and efficient matching. However,\nfixed-length fingerprint representations are limited in accuracy when matching\nfingerprints with different visible areas, which can occur due to different\nfinger poses or acquisition methods. To address this issue, we propose a\nlocalized deep representation of fingerprint, named LDRF. By focusing on the\ndiscriminative characteristics within local regions, LDRF provides a more\nrobust and accurate fixed-length representation for fingerprints with variable\nvisible areas. LDRF can be adapted to retain information within any valid area,\nmaking it highly flexible. The matching scores produced by LDRF also exhibit\nintuitive statistical characteristics, which led us to propose a matching score\nnormalization technique to mitigate the uncertainty in the cases of very small\noverlapping area. With this new technique, we can maintain a high level of\naccuracy and reliability in our fingerprint matching, even as the size of the\ndatabase grows rapidly. Our experimental results on 21 datasets containing over\n140K fingerprints of various finger poses and impression types show that LDRF\noutperforms other fixed-length representations and is robust to sensing\ntechnologies and impression types. 
Besides, the proposed matching score\nnormalization effectively reduces the false match rate (FMR) in large-scale\nidentification experiments comprising over 5.11 million fingerprints.\nSpecifically, this technique results in a reduction of two orders of magnitude\ncompared to matching without matching score normalization and five orders of\nmagnitude compared to prior works.\n","authors":["Yongjie Duan","Zhiyu Pan","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.18576v1.pdf","comment":"18 pages, 20 figures"},{"id":"http://arxiv.org/abs/2109.06296v3","updated":"2023-11-30T14:15:06Z","published":"2021-09-13T20:12:42Z","title":"Monocular Camera Localization for Automated Vehicles Using Image\n Retrieval","summary":" We address the problem of finding the current position and heading angle of\nan autonomous vehicle in real-time using a single camera. Compared to methods\nwhich require LiDARs and high definition (HD) 3D maps in real-time, the\nproposed approach is easily scalable and computationally efficient, at the\nprice of lower precision.\n The new method combines and adapts existing algorithms in three different\nfields: image retrieval, mapping database, and particle filtering. The result\nis a simple, real-time localization method using an image retrieval method\nwhose performance is comparable to other monocular camera localization methods\nwhich use a map built with LiDARs.\n We evaluate the proposed method using the KITTI odometry dataset and via\nclosed-loop experiments with an indoor 1:10 autonomous vehicle. The tests\ndemonstrate real-time capability and a 10cm level accuracy. Also, experimental\nresults of the closed-loop indoor tests show the presence of a positive\nfeedback loop between the localization error and the control error. This\nphenomenon is analysed in detail at the end of the article.\n","authors":["Eunhyek Joa","Yibo Sun","Francesco Borrelli"],"pdf_url":"https://arxiv.org/pdf/2109.06296v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17644v2","updated":"2023-11-30T14:06:42Z","published":"2023-05-28T06:19:36Z","title":"Caterpillar: A Pure-MLP Architecture with Shifted-Pillars-Concatenation","summary":" Modeling in Computer Vision has evolved to MLPs. Vision MLPs naturally lack\nlocal modeling capability, to which the simplest treatment is to combine them with\nconvolutional layers. Convolution, famous for its sliding window scheme, also\nsuffers from this scheme of redundancy and low computational efficiency. In\nthis paper, we seek to dispense with the windowing scheme and introduce a more\nelaborate and effective approach to exploiting locality. To this end, we\npropose a new MLP module, namely Shifted-Pillars-Concatenation (SPC), that\nconsists of two processing steps: (1) Pillars-Shift, which generates four\nneighboring maps by shifting the input image along four directions, and (2)\nPillars-Concatenation, which applies linear transformations and concatenation\non the maps to aggregate local features. The SPC module offers superior local\nmodeling power and performance gains, making it a promising alternative to the\nconvolutional layer. Then, we build a pure-MLP architecture called Caterpillar\nby replacing the convolutional layer with the SPC module in a hybrid model of\nsMLPNet. 
Extensive experiments show Caterpillar's excellent performance and\nscalability on both ImageNet-1K and small-scale classification benchmarks.\n","authors":["Jin Sun","Xiaoshuang Shi","Zhiyuan Wang","Kaidi Xu","Heng Tao Shen","Xiaofeng Zhu"],"pdf_url":"https://arxiv.org/pdf/2305.17644v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18572v1","updated":"2023-11-30T14:06:27Z","published":"2023-11-30T14:06:27Z","title":"Overcoming Label Noise for Source-free Unsupervised Video Domain\n Adaptation","summary":" Despite the progress seen in classification methods, current approaches for\nhandling videos with distribution shifts in source and target domains remain\nsource-dependent as they require access to the source data during the\nadaptation stage. In this paper, we present a self-training based source-free\nvideo domain adaptation approach to address this challenge by bridging the gap\nbetween the source and the target domains. We use the source pre-trained model\nto generate pseudo-labels for the target domain samples, which are inevitably\nnoisy. Thus, we treat the problem of source-free video domain adaptation as\nlearning from noisy labels and argue that the samples with correct\npseudo-labels can help us in adaptation. To this end, we leverage the\ncross-entropy loss as an indicator of the correctness of the pseudo-labels and\nuse the resulting small-loss samples from the target domain for fine-tuning the\nmodel. We further enhance the adaptation performance by implementing a\nteacher-student framework, in which the teacher, which is updated gradually,\nproduces reliable pseudo-labels. Meanwhile, the student undergoes fine-tuning\non the target domain videos using these generated pseudo-labels to improve its\nperformance. Extensive experimental evaluations show that our methods, termed\nas CleanAdapt, CleanAdapt + TS, achieve state-of-the-art results, outperforming\nthe existing approaches on various open datasets. Our source code is publicly\navailable at https://avijit9.github.io/CleanAdapt.\n","authors":["Avijit Dasgupta","C. V. Jawahar","Karteek Alahari"],"pdf_url":"https://arxiv.org/pdf/2311.18572v1.pdf","comment":"Extended version of our ICVGIP paper"},{"id":"http://arxiv.org/abs/2304.03492v2","updated":"2023-11-30T14:00:56Z","published":"2023-04-07T06:23:54Z","title":"ClothCombo: Modeling Inter-Cloth Interaction for Draping Multi-Layered\n Clothes","summary":" We present ClothCombo, a pipeline to drape arbitrary combinations of clothes\non 3D human models with varying body shapes and poses. While existing\nlearning-based approaches for draping clothes have shown promising results,\nmulti-layered clothing remains challenging as it is non-trivial to model\ninter-cloth interaction. To this end, our method utilizes a GNN-based network\nto efficiently model the interaction between clothes in different layers, thus\nenabling multi-layered clothing. Specifically, we first create feature\nembedding for each cloth using a topology-agnostic network. Then, the draping\nnetwork deforms all clothes to fit the target body shape and pose without\nconsidering inter-cloth interaction. Lastly, the untangling network predicts\nthe per-vertex displacements in a way that resolves interpenetration between\nclothes. In experiments, the proposed model demonstrates strong performance in\ncomplex multi-layered scenarios. 
Being agnostic to cloth topology, our method\ncan be readily used for layered virtual try-on of real clothes in diverse poses\nand combinations of clothes.\n","authors":["Dohae Lee","Hyun Kang","In-Kwon Lee"],"pdf_url":"https://arxiv.org/pdf/2304.03492v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18564v1","updated":"2023-11-30T13:55:29Z","published":"2023-11-30T13:55:29Z","title":"Seam-guided local alignment and stitching for large parallax images","summary":" Seam-cutting methods have been proven effective in the composition step of\nimage stitching, especially for images with parallax. However, the\neffectiveness of seam-cutting usually depends on that images can be roughly\naligned such that there exists a local region where a plausible seam can be\nfound. For images with large parallax, current alignment methods often fall\nshort of expectations. In this paper, we propose a local alignment and\nstitching method guided by seam quality evaluation. First, we use existing\nimage alignment and seam-cutting methods to calculate an initial seam and\nevaluate the quality of pixels along the seam. Then, for pixels with low\nqualities, we separate their enclosing patches in the aligned images and\nlocally align them by extracting modified dense correspondences via SIFT flow.\nFinally, we composite the aligned patches via seam-cutting and merge them into\nthe original aligned result to generate the final mosaic. Experiments show that\ncompared with the state-of-the-art seam-cutting methods, our result is more\nplausible and with fewer artifacts. The code will be available at\nhttps://github.com/tlliao/Seam-guided-local-alignment.\n","authors":["Tianli Liao","Chenyang Zhao","Lei Li","Heling Cao"],"pdf_url":"https://arxiv.org/pdf/2311.18564v1.pdf","comment":"13 pages, 12 figures, in peer review"},{"id":"http://arxiv.org/abs/2311.18561v1","updated":"2023-11-30T13:53:50Z","published":"2023-11-30T13:53:50Z","title":"Periodic Vibration Gaussian: Dynamic Urban Scene Reconstruction and\n Real-time Rendering","summary":" Modeling dynamic, large-scale urban scenes is challenging due to their highly\nintricate geometric structures and unconstrained dynamics in both space and\ntime. Prior methods often employ high-level architectural priors, separating\nstatic and dynamic elements, resulting in suboptimal capture of their\nsynergistic interactions. To address this challenge, we present a unified\nrepresentation model, called Periodic Vibration Gaussian (PVG). PVG builds upon\nthe efficient 3D Gaussian splatting technique, originally designed for static\nscene representation, by introducing periodic vibration-based temporal\ndynamics. This innovation enables PVG to elegantly and uniformly represent the\ncharacteristics of various objects and elements in dynamic urban scenes. To\nenhance temporally coherent representation learning with sparse training data,\nwe introduce a novel flow-based temporal smoothing mechanism and a\nposition-aware adaptive control strategy. Extensive experiments on Waymo Open\nDataset and KITTI benchmarks demonstrate that PVG surpasses state-of-the-art\nalternatives in both reconstruction and novel view synthesis for both dynamic\nand static scenes. Notably, PVG achieves this without relying on manually\nlabeled object bounding boxes or expensive optical flow estimation. 
Moreover,\nPVG exhibits 50/6000-fold acceleration in training/rendering over the best\nalternative.\n","authors":["Yurui Chen","Chun Gu","Junzhe Jiang","Xiatian Zhu","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.18561v1.pdf","comment":"Project page: https://fudan-zvg.github.io/PVG/"},{"id":"http://arxiv.org/abs/2109.11369v3","updated":"2023-11-30T13:51:31Z","published":"2021-09-23T13:30:18Z","title":"Recent Advances of Continual Learning in Computer Vision: An Overview","summary":" In contrast to batch learning where all training data is available at once,\ncontinual learning represents a family of methods that accumulate knowledge and\nlearn continuously with data available in sequential order. Similar to the\nhuman learning process with the ability of learning, fusing, and accumulating\nnew knowledge coming at different time steps, continual learning is considered\nto have high practical significance. Hence, continual learning has been studied\nin various artificial intelligence tasks. In this paper, we present a\ncomprehensive review of the recent progress of continual learning in computer\nvision. In particular, the works are grouped by their representative\ntechniques, including regularization, knowledge distillation, memory,\ngenerative replay, parameter isolation, and a combination of the above\ntechniques. For each category of these techniques, both its characteristics and\napplications in computer vision are presented. At the end of this overview,\nseveral subareas, where continuous knowledge accumulation is potentially\nhelpful while continual learning has not been well studied, are discussed.\n","authors":["Haoxuan Qu","Hossein Rahmani","Li Xu","Bryan Williams","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2109.11369v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18559v1","updated":"2023-11-30T13:50:38Z","published":"2023-11-30T13:50:38Z","title":"FediOS: Decoupling Orthogonal Subspaces for Personalization in\n Feature-skew Federated Learning","summary":" Personalized federated learning (pFL) enables collaborative training among\nmultiple clients to enhance the capability of customized local models. In pFL,\nclients may have heterogeneous (also known as non-IID) data, which poses a key\nchallenge in how to decouple the data knowledge into generic knowledge for\nglobal sharing and personalized knowledge for preserving local personalization.\nA typical way of pFL focuses on label distribution skew, and they adopt a\ndecoupling scheme where the model is split into a common feature extractor and\ntwo prediction heads (generic and personalized). However, such a decoupling\nscheme cannot solve the essential problem of feature skew heterogeneity,\nbecause a common feature extractor cannot decouple the generic and personalized\nfeatures. Therefore, in this paper, we rethink the architecture decoupling\ndesign for feature-skew pFL and propose an effective pFL method called FediOS.\nIn FediOS, we reformulate the decoupling into two feature extractors (generic\nand personalized) and one shared prediction head. Orthogonal projections are\nused for clients to map the generic features into one common subspace and\nscatter the personalized features into different subspaces to achieve\ndecoupling for them. 
In addition, a shared prediction head is trained to\nbalance the importance of generic and personalized features during inference.\nExtensive experiments on four vision datasets demonstrate our method reaches\nstate-of-the-art pFL performances under feature skew heterogeneity.\n","authors":["Lingzhi Gao","Zexi Li","Yang Lu","Chao Wu"],"pdf_url":"https://arxiv.org/pdf/2311.18559v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18553v1","updated":"2023-11-30T13:46:05Z","published":"2023-11-30T13:46:05Z","title":"Heterogeneous Graph-based Trajectory Prediction using Local Map Context\n and Social Interactions","summary":" Precisely predicting the future trajectories of surrounding traffic\nparticipants is a crucial but challenging problem in autonomous driving, due to\ncomplex interactions between traffic agents, map context and traffic rules.\nVector-based approaches have recently shown to achieve among the best\nperformances on trajectory prediction benchmarks. These methods model simple\ninteractions between traffic agents but don't distinguish between relation-type\nand attributes like their distance along the road. Furthermore, they represent\nlanes only by sequences of vectors representing center lines and ignore context\ninformation like lane dividers and other road elements. We present a novel\napproach for vector-based trajectory prediction that addresses these\nshortcomings by leveraging three crucial sources of information: First, we\nmodel interactions between traffic agents by a semantic scene graph, that\naccounts for the nature and important features of their relation. Second, we\nextract agent-centric image-based map features to model the local map context.\nFinally, we generate anchor paths to enforce the policy in multi-modal\nprediction to permitted trajectories only. Each of these three enhancements\nshows advantages over the baseline model HoliGraph.\n","authors":["Daniel Grimm","Maximilian Zipfl","Felix Hertlein","Alexander Naumann","Jürgen Lüttin","Steffen Thoma","Stefan Schmid","Lavdim Halilaj","Achim Rettinger","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2311.18553v1.pdf","comment":"Accepted on IEEE ITSC 2023"},{"id":"http://arxiv.org/abs/2311.18540v1","updated":"2023-11-30T13:22:15Z","published":"2023-11-30T13:22:15Z","title":"Match me if you can: Semantic Correspondence Learning with Unpaired\n Images","summary":" Recent approaches for semantic correspondence have focused on obtaining\nhigh-quality correspondences using a complicated network, refining the\nambiguous or noisy matching points. Despite their performance improvements,\nthey remain constrained by the limited training pairs due to costly point-level\nannotations. This paper proposes a simple yet effective method that performs\ntraining with unlabeled pairs to complement both limited image pairs and sparse\npoint pairs, requiring neither extra labeled keypoints nor trainable modules.\nWe fundamentally extend the data quantity and variety by augmenting new\nunannotated pairs not primitively provided as training pairs in benchmarks.\nUsing a simple teacher-student framework, we offer reliable pseudo\ncorrespondences to the student network via machine supervision. Finally, the\nperformance of our network is steadily improved by the proposed iterative\ntraining, putting back the student as a teacher to generate refined labels and\ntrain a new student repeatedly. 
Our models outperform the milestone baselines,\nincluding state-of-the-art methods on semantic correspondence benchmarks.\n","authors":["Jiwon Kim","Byeongho Heo","Sangdoo Yun","Seungryong Kim","Dongyoon Han"],"pdf_url":"https://arxiv.org/pdf/2311.18540v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2311.18537v1","updated":"2023-11-30T13:20:09Z","published":"2023-11-30T13:20:09Z","title":"MaXTron: Mask Transformer with Trajectory Attention for Video Panoptic\n Segmentation","summary":" Video panoptic segmentation requires consistently segmenting (for both\n`thing' and `stuff' classes) and tracking objects in a video over time. In this\nwork, we present MaXTron, a general framework that exploits Mask XFormer with\nTrajectory Attention to tackle the task. MaXTron enriches an off-the-shelf mask\ntransformer by leveraging trajectory attention. The deployed mask transformer\ntakes as input a short clip consisting of only a few frames and predicts the\nclip-level segmentation. To enhance the temporal consistency, MaXTron employs\nwithin-clip and cross-clip tracking modules, efficiently utilizing trajectory\nattention. Originally designed for video classification, trajectory attention\nlearns to model the temporal correspondences between neighboring frames and\naggregates information along the estimated motion paths. However, it is\nnontrivial to directly extend trajectory attention to the per-pixel dense\nprediction tasks due to its quadratic dependency on input size. To alleviate\nthe issue, we propose to adapt the trajectory attention for both the dense\npixel features and object queries, aiming to improve the short-term and\nlong-term tracking results, respectively. Particularly, in our within-clip\ntracking module, we propose axial-trajectory attention that effectively\ncomputes the trajectory attention for tracking dense pixels sequentially along\nthe height- and width-axes. The axial decomposition significantly reduces the\ncomputational complexity for dense pixel features. In our cross-clip tracking\nmodule, since the object queries in mask transformer are learned to encode the\nobject information, we are able to capture the long-term temporal connections\nby applying trajectory attention to object queries, which learns to track each\nobject across different clips. Without bells and whistles, MaXTron demonstrates\nstate-of-the-art performances on video segmentation benchmarks.\n","authors":["Ju He","Qihang Yu","Inkyu Shin","Xueqing Deng","Xiaohui Shen","Alan Yuille","Liang-Chieh Chen"],"pdf_url":"https://arxiv.org/pdf/2311.18537v1.pdf","comment":"Code at https://github.com/TACJu/MaXTron"},{"id":"http://arxiv.org/abs/2311.18531v1","updated":"2023-11-30T13:15:28Z","published":"2023-11-30T13:15:28Z","title":"Dataset Distillation via the Wasserstein Metric","summary":" Dataset distillation (DD) offers a compelling approach in computer vision,\nwith the goal of condensing extensive datasets into smaller synthetic versions\nwithout sacrificing much of the model performance. In this paper, we continue\nto study the methods for DD, by addressing its conceptually core objective: how\nto capture the essential representation of extensive datasets in smaller,\nsynthetic forms.\n We propose a novel approach utilizing the Wasserstein distance, a metric\nrooted in optimal transport theory, to enhance distribution matching in DD. 
Our\nmethod leverages the Wasserstein barycenter, offering a geometrically\nmeaningful way to quantify distribution differences and effectively capture the\ncentroid of a set of distributions. Our approach retains the computational\nbenefits of distribution matching-based methods while achieving new\nstate-of-the-art performance on several benchmarks.\n To provide useful prior for learning the images, we embed the synthetic data\ninto the feature space of pretrained classification models to conduct\ndistribution matching. Extensive testing on various high-resolution datasets\nconfirms the effectiveness and adaptability of our method, indicating the\npromising yet unexplored capabilities of Wasserstein metrics in dataset\ndistillation.\n","authors":["Haoyang Liu","Tiancheng Xing","Luwei Li","Vibhu Dalal","Jingrui He","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.18531v1.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2311.15773v2","updated":"2023-11-30T13:14:28Z","published":"2023-11-27T12:48:33Z","title":"Check, Locate, Rectify: A Training-Free Layout Calibration System for\n Text-to-Image Generation","summary":" Diffusion models have recently achieved remarkable progress in generating\nrealistic images. However, challenges remain in accurately understanding and\nsynthesizing the layout requirements in the textual prompts. To align the\ngenerated image with layout instructions, we present a training-free layout\ncalibration system SimM that intervenes in the generative process on the fly\nduring inference time. Specifically, following a \"check-locate-rectify\"\npipeline, the system first analyses the prompt to generate the target layout\nand compares it with the intermediate outputs to automatically detect errors.\nThen, by moving the located activations and making intra- and inter-map\nadjustments, the rectification process can be performed with negligible\ncomputational overhead. To evaluate SimM over a range of layout requirements,\nwe present a benchmark SimMBench that compensates for the lack of superlative\nspatial relations in existing datasets. And both quantitative and qualitative\nresults demonstrate the effectiveness of the proposed SimM in calibrating the\nlayout inconsistencies. Our project page is at https://simm-t2i.github.io/SimM.\n","authors":["Biao Gong","Siteng Huang","Yutong Feng","Shiwei Zhang","Yuyuan Li","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2311.15773v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.01681v2","updated":"2023-11-30T13:08:34Z","published":"2023-03-03T02:52:28Z","title":"Dense Pixel-to-Pixel Harmonization via Continuous Image Representation","summary":" High-resolution (HR) image harmonization is of great significance in\nreal-world applications such as image synthesis and image editing. However, due\nto the high memory costs, existing dense pixel-to-pixel harmonization methods\nare mainly focusing on processing low-resolution (LR) images. Some recent works\nresort to combining with color-to-color transformations but are either limited\nto certain resolutions or heavily depend on hand-crafted image filters. In this\nwork, we explore leveraging the implicit neural representation (INR) and\npropose a novel image Harmonization method based on Implicit neural Networks\n(HINet), which to the best of our knowledge, is the first dense pixel-to-pixel\nmethod applicable to HR images without any hand-crafted filter design. 
Inspired\nby the Retinex theory, we decouple the MLPs into two parts to respectively\ncapture the content and environment of composite images. A Low-Resolution Image\nPrior (LRIP) network is designed to alleviate the Boundary Inconsistency\nproblem, and we also propose new designs for the training and inference\nprocess. Extensive experiments have demonstrated the effectiveness of our\nmethod compared with state-of-the-art methods. Furthermore, some interesting\nand practical applications of the proposed method are explored. Our code is\navailable at https://github.com/WindVChen/INR-Harmonization.\n","authors":["Jianqi Chen","Yilan Zhang","Zhengxia Zou","Keyan Chen","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2303.01681v2.pdf","comment":"Accepted by IEEE Transactions on Circuits and Systems for Video\n Technology (TCSVT)"},{"id":"http://arxiv.org/abs/2210.04020v2","updated":"2023-11-30T13:05:34Z","published":"2022-10-08T13:14:02Z","title":"Fast-ParC: Capturing Position Aware Global Feature for ConvNets and ViTs","summary":" Transformer models have made tremendous progress in various fields in recent\nyears. In the field of computer vision, vision transformers (ViTs) also become\nstrong alternatives to convolutional neural networks (ConvNets), yet they have\nnot been able to replace ConvNets since both have their own merits. For\ninstance, ViTs are good at extracting global features with attention mechanisms\nwhile ConvNets are more efficient in modeling local relationships due to their\nstrong inductive bias. A natural idea that arises is to combine the strengths\nof both ConvNets and ViTs to design new structures. In this paper, we propose a\nnew basic neural network operator named position-aware circular convolution\n(ParC) and its accelerated version Fast-ParC. The ParC operator can capture\nglobal features by using a global kernel and circular convolution while keeping\nlocation sensitiveness by employing position embeddings. Our Fast-ParC further\nreduces the O(n2) time complexity of ParC to O(n log n) using Fast Fourier\nTransform. This acceleration makes it possible to use global convolution in the\nearly stages of models with large feature maps, yet still maintains the overall\ncomputational cost comparable with using 3x3 or 7x7 kernels. The proposed\noperation can be used in a plug-and-play manner to 1) convert ViTs to\npure-ConvNet architecture to enjoy wider hardware support and achieve higher\ninference speed; 2) replacing traditional convolutions in the deep stage of\nConvNets to improve accuracy by enlarging the effective receptive field.\nExperiment results show that our ParC op can effectively enlarge the receptive\nfield of traditional ConvNets, and adopting the proposed op benefits both ViTs\nand ConvNet models on all three popular vision tasks, image classification,\nobject\n","authors":["Tao Yang","Haokui Zhang","Wenze Hu","Changwen Chen","Xiaoyu Wang"],"pdf_url":"https://arxiv.org/pdf/2210.04020v2.pdf","comment":"22 pages, 8 figures, 10 tables. 
A preliminary version of this paper\n has been published in ECCV 2022 and it can be find in arXiv:2203.03952"},{"id":"http://arxiv.org/abs/2311.17834v2","updated":"2023-11-30T12:59:21Z","published":"2023-11-29T17:36:49Z","title":"SPiC-E : Structural Priors in 3D Diffusion Models using Cross-Entity\n Attention","summary":" We are witnessing rapid progress in automatically generating and manipulating\n3D assets due to the availability of pretrained text-image diffusion models.\nHowever, time-consuming optimization procedures are required for synthesizing\neach sample, hindering their potential for democratizing 3D content creation.\nConversely, 3D diffusion models now train on million-scale 3D datasets,\nyielding high-quality text-conditional 3D samples within seconds. In this work,\nwe present SPiC-E - a neural network that adds structural guidance to 3D\ndiffusion models, extending their usage beyond text-conditional generation. At\nits core, our framework introduces a cross-entity attention mechanism that\nallows for multiple entities (in particular, paired input and guidance 3D\nshapes) to interact via their internal representations within the denoising\nnetwork. We utilize this mechanism for learning task-specific structural priors\nin 3D diffusion models from auxiliary guidance shapes. We show that our\napproach supports a variety of applications, including 3D stylization, semantic\nshape editing and text-conditional abstraction-to-3D, which transforms\nprimitive-based abstractions into highly-expressive shapes. Extensive\nexperiments demonstrate that SPiC-E achieves SOTA performance over these tasks\nwhile often being considerably faster than alternative methods. Importantly,\nthis is accomplished without tailoring our approach for any specific task.\n","authors":["Etai Sella","Gal Fiebelman","Noam Atia","Hadar Averbuch-Elor"],"pdf_url":"https://arxiv.org/pdf/2311.17834v2.pdf","comment":"Project webpage: https://tau-vailab.github.io/spic-e"},{"id":"http://arxiv.org/abs/2311.03054v3","updated":"2023-11-30T12:54:05Z","published":"2023-11-06T12:10:43Z","title":"AnyText: Multilingual Visual Text Generation And Editing","summary":" Diffusion model based Text-to-Image has achieved impressive achievements\nrecently. Although current technology for synthesizing images is highly\nadvanced and capable of generating images with high fidelity, it is still\npossible to give the show away when focusing on the text area in the generated\nimage. To address this issue, we introduce AnyText, a diffusion-based\nmultilingual visual text generation and editing model, that focuses on\nrendering accurate and coherent text in the image. AnyText comprises a\ndiffusion pipeline with two primary elements: an auxiliary latent module and a\ntext embedding module. The former uses inputs like text glyph, position, and\nmasked image to generate latent features for text generation or editing. The\nlatter employs an OCR model for encoding stroke data as embeddings, which blend\nwith image caption embeddings from the tokenizer to generate texts that\nseamlessly integrate with the background. We employed text-control diffusion\nloss and text perceptual loss for training to further enhance writing accuracy.\nAnyText can write characters in multiple languages, to the best of our\nknowledge, this is the first work to address multilingual visual text\ngeneration. 
It is worth mentioning that AnyText can be plugged into existing\ndiffusion models from the community for rendering or editing text accurately.\nAfter conducting extensive evaluation experiments, our method has outperformed\nall other approaches by a significant margin. Additionally, we contribute the\nfirst large-scale multilingual text images dataset, AnyWord-3M, containing 3\nmillion image-text pairs with OCR annotations in multiple languages. Based on\nAnyWord-3M dataset, we propose AnyText-benchmark for the evaluation of visual\ntext generation accuracy and quality. Our project will be open-sourced on\nhttps://github.com/tyxsspa/AnyText to improve and promote the development of\ntext generation technology.\n","authors":["Yuxiang Tuo","Wangmeng Xiang","Jun-Yan He","Yifeng Geng","Xuansong Xie"],"pdf_url":"https://arxiv.org/pdf/2311.03054v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18518v1","updated":"2023-11-30T12:49:11Z","published":"2023-11-30T12:49:11Z","title":"Color-Emotion Associations in Art: Fuzzy Approach","summary":" Art objects can evoke certain emotions. Color is a fundamental element of\nvisual art and plays a significant role in how art is perceived. This paper\nintroduces a novel approach to classifying emotions in art using Fuzzy Sets. We\nemploy a fuzzy approach because it aligns well with human judgments' imprecise\nand subjective nature. Extensive fuzzy colors (n=120) and a broad emotional\nspectrum (n=10) allow for a more human-consistent and context-aware exploration\nof emotions inherent in paintings. First, we introduce the fuzzy color\nrepresentation model. Then, at the fuzzification stage, we process the Wiki Art\nDataset of paintings tagged with emotions, extracting fuzzy dominant colors\nlinked to specific emotions. This results in fuzzy color distributions for ten\nemotions. Finally, we convert them back to a crisp domain, obtaining a\nknowledge base of color-emotion associations in primary colors. Our findings\nreveal strong associations between specific emotions and colors; for instance,\ngratitude strongly correlates with green, brown, and orange. Other noteworthy\nassociations include brown and anger, orange with shame, yellow with happiness,\nand gray with fear. Using these associations and Jaccard similarity, we can\nfind the emotions in the arbitrary untagged image. We conducted a 2AFC\nexperiment involving human subjects to evaluate the proposed method. The\naverage hit rate of 0.77 indicates a significant correlation between the\nmethod's predictions and human perception. The proposed method is simple to\nadapt to art painting retrieval systems. The study contributes to the\ntheoretical understanding of color-emotion associations in art, offering\nvaluable insights for various practical applications besides art, like\nmarketing, design, and psychology.\n","authors":["Pakizar Shamoi","Muragul Muratbekova"],"pdf_url":"https://arxiv.org/pdf/2311.18518v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2311.18512v1","updated":"2023-11-30T12:40:23Z","published":"2023-11-30T12:40:23Z","title":"Revisiting Proposal-based Object Detection","summary":" This paper revisits the pipeline for detecting objects in images with\nproposals. For any object detector, the obtained box proposals or queries need\nto be classified and regressed towards ground truth boxes. 
The common solution\nfor the final predictions is to directly maximize the overlap between each\nproposal and the ground truth box, followed by a winner-takes-all ranking or\nnon-maximum suppression. In this work, we propose a simple yet effective\nalternative. For proposal regression, we solve a simpler problem where we\nregress to the area of intersection between proposal and ground truth. In this\nway, each proposal only specifies which part contains the object, avoiding a\nblind inpainting problem where proposals need to be regressed beyond their\nvisual scope. In turn, we replace the winner-takes-all strategy and obtain the\nfinal prediction by taking the union over the regressed intersections of a\nproposal group surrounding an object. Our revisited approach comes with minimal\nchanges to the detection pipeline and can be plugged into any existing method.\nWe show that our approach directly improves canonical object detection and\ninstance segmentation architectures, highlighting the utility of\nintersection-based regression and grouping.\n","authors":["Aritra Bhowmik","Martin R. Oswald","Pascal Mettes","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2311.18512v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.18508v1","updated":"2023-11-30T12:37:53Z","published":"2023-11-30T12:37:53Z","title":"DifAugGAN: A Practical Diffusion-style Data Augmentation for GAN-based\n Single Image Super-resolution","summary":" It is well known that the adversarial optimization of GAN-based image\nsuper-resolution (SR) methods makes the preceding SR model generate unpleasant\nand undesirable artifacts, leading to large distortion. We attribute the cause\nof such distortions to the poor calibration of the discriminator, which hampers\nits ability to provide meaningful feedback to the generator for learning\nhigh-quality images. To address this problem, we propose a simple but\nnon-trivial diffusion-style data augmentation scheme for current GAN-based SR\nmethods, known as DifAugGAN. It involves adapting the diffusion process in\ngenerative diffusion models for improving the calibration of the discriminator\nduring training, motivated by the successes of data augmentation schemes in the\nfield to achieve good calibration. Our DifAugGAN can be a Plug-and-Play\nstrategy for current GAN-based SISR methods to improve the calibration of the\ndiscriminator and thus improve SR performance. Extensive experimental\nevaluations demonstrate the superiority of DifAugGAN over state-of-the-art\nGAN-based SISR methods across both synthetic and real-world datasets,\nshowcasing notable advancements in both qualitative and quantitative results.\n","authors":["Axi Niu","Kang Zhang","Joshua Tian Jin Tee","Trung X. Pham","Jinqiu Sun","Chang D. Yoo","In So Kweon","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.18508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18496v1","updated":"2023-11-30T12:17:16Z","published":"2023-11-30T12:17:16Z","title":"Accurate Segmentation of Optic Disc And Cup from Multiple Pseudo-labels\n by Noise-Aware Learning","summary":" Optic disc and cup segmentation play a crucial role in automating the\nscreening and diagnosis of optic glaucoma. While data-driven convolutional\nneural networks (CNNs) show promise in this area, the inherent ambiguity of\nsegmenting object and background boundaries in the task of optic disc and cup\nsegmentation leads to noisy annotations that impact model performance. 
To\naddress this, we propose an innovative label-denoising method, the Multiple\nPseudo-labels Noise-aware Network (MPNN), for accurate optic disc and cup\nsegmentation. Specifically, the Multiple Pseudo-labels Generation and Guided\nDenoising (MPGGD) module generates pseudo-labels using multiple differently\ninitialized networks trained on true labels, and the pixel-level consensus\ninformation extracted from these pseudo-labels guides the differentiation of clean\npixels from noisy pixels. The training framework of the MPNN is constructed by\na teacher-student architecture to learn segmentation from clean pixels and\nnoisy pixels. Particularly, such a framework adeptly leverages (i) reliable and\nfundamental insights from clean pixels and (ii) the supplementary knowledge\nwithin noisy pixels via multiple perturbation-based unsupervised consistency.\nCompared to other label-denoising methods, comprehensive experimental results\non the RIGA dataset demonstrate our method's excellent performance and\nsignificant denoising ability.\n","authors":["Tengjin Weng","Yang Shen","Zhidong Zhao","Zhiming Cheng","Shuai Wang"],"pdf_url":"https://arxiv.org/pdf/2311.18496v1.pdf","comment":"to CSCWD 2024"},{"id":"http://arxiv.org/abs/2311.18495v1","updated":"2023-11-30T12:15:49Z","published":"2023-11-30T12:15:49Z","title":"Improving Adversarial Transferability via Model Alignment","summary":" Neural networks are susceptible to adversarial perturbations that are\ntransferable across different models. In this paper, we introduce a novel model\nalignment technique aimed at improving a given source model's ability in\ngenerating transferable adversarial perturbations. During the alignment\nprocess, the parameters of the source model are fine-tuned to minimize an\nalignment loss. This loss measures the divergence in the predictions between\nthe source model and another, independently trained model, referred to as the\nwitness model. To understand the effect of model alignment, we conduct a\ngeometric analysis of the resulting changes in the loss landscape. Extensive\nexperiments on the ImageNet dataset, using a variety of model architectures,\ndemonstrate that perturbations generated from aligned source models exhibit\nsignificantly higher transferability than those from the original source model.\n","authors":["Avery Ma","Amir-massoud Farahmand","Yangchen Pan","Philip Torr","Jindong Gu"],"pdf_url":"https://arxiv.org/pdf/2311.18495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18494v1","updated":"2023-11-30T12:15:45Z","published":"2023-11-30T12:15:45Z","title":"PRS: Sharp Feature Priors for Resolution-Free Surface Remeshing","summary":" Surface reconstruction with preservation of geometric features is a\nchallenging computer vision task. Despite significant progress in implicit\nshape reconstruction, state-of-the-art mesh extraction methods often produce\naliased, perceptually distorted surfaces and lack scalability to\nhigh-resolution 3D shapes. We present a data-driven approach for automatic\nfeature detection and remeshing that requires only a coarse, aliased mesh as\ninput and scales to arbitrary resolution reconstructions. We define and learn a\ncollection of surface-based fields to (1) capture sharp geometric features in\nthe shape with an implicit vertexwise model and (2) approximate improvements in\nnormals alignment obtained by applying edge-flips with an edgewise model. 
To\nsupport scaling to arbitrary complexity shapes, we learn our fields using local\ntriangulated patches, fusing estimates on complete surface meshes. Our feature\nremeshing algorithm integrates the learned fields as sharp feature priors and\noptimizes vertex placement and mesh connectivity for maximum expected surface\nimprovement. On a challenging collection of high-resolution shape\nreconstructions in the ABC dataset, our algorithm improves over\nstate-of-the-art by 26% normals F-score and 42% perceptual\n$\\text{RMSE}_{\\text{v}}$.\n","authors":["Natalia Soboleva","Olga Gorbunova","Maria Ivanova","Evgeny Burnaev","Matthias Nießner","Denis Zorin","Alexey Artemov"],"pdf_url":"https://arxiv.org/pdf/2311.18494v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.02582v3","updated":"2023-11-30T12:10:34Z","published":"2023-06-05T04:21:00Z","title":"Enhancing Point Annotations with Superpixel and Confidence Learning\n Guided for Improving Semi-Supervised OCT Fluid Segmentation","summary":" Automatic segmentation of fluid in Optical Coherence Tomography (OCT) images\nis beneficial for ophthalmologists to make an accurate diagnosis. Although\nsemi-supervised OCT fluid segmentation networks enhance their performance by\nintroducing additional unlabeled data, the performance enhancement is limited.\nTo address this, we propose Superpixel and Confident Learning Guide Point\nAnnotations Network (SCLGPA-Net) based on the teacher-student architecture,\nwhich can learn OCT fluid segmentation from limited fully-annotated data and\nabundant point-annotated data. Specifically, we use points to annotate fluid\nregions in unlabeled OCT images and the Superpixel-Guided Pseudo-Label\nGeneration (SGPLG) module generates pseudo-labels and pixel-level label trust\nmaps from the point annotations. The label trust maps provide an indication of\nthe reliability of the pseudo-labels. Furthermore, we propose the Confident\nLearning Guided Label Refinement (CLGLR) module identifies error information in\nthe pseudo-labels and leads to further refinement. Experiments on the RETOUCH\ndataset show that we are able to reduce the need for fully-annotated data by\n94.22\\%, closing the gap with the best fully supervised baselines to a mean IoU\nof only 2\\%. Furthermore, We constructed a private 2D OCT fluid segmentation\ndataset for evaluation. Compared with other methods, comprehensive experimental\nresults demonstrate that the proposed method can achieve excellent performance\nin OCT fluid segmentation.\n","authors":["Tengjin Weng","Yang Shen","Kai Jin","Zhiming Cheng","Yunxiang Li","Gewen Zhang","Shuai Wang","Yaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2306.02582v3.pdf","comment":"Submission to BSPC"},{"id":"http://arxiv.org/abs/2311.12704v2","updated":"2023-11-30T12:07:23Z","published":"2023-11-21T16:19:14Z","title":"Cascade Learning Localises Discriminant Features in Visual Scene\n Classification","summary":" Lack of interpretability of deep convolutional neural networks (DCNN) is a\nwell-known problem particularly in the medical domain as clinicians want\ntrustworthy automated decisions. One way to improve trust is to demonstrate the\nlocalisation of feature representations with respect to expert labeled regions\nof interest. In this work, we investigate the localisation of features learned\nvia two varied learning paradigms and demonstrate the superiority of one\nlearning approach with respect to localisation. 
Our analysis on medical and\nnatural datasets show that the traditional end-to-end (E2E) learning strategy\nhas a limited ability to localise discriminative features across multiple\nnetwork layers. We show that a layer-wise learning strategy, namely cascade\nlearning (CL), results in more localised features. Considering localisation\naccuracy, we not only show that CL outperforms E2E but that it is a promising\nmethod of predicting regions. On the YOLO object detection framework, our best\nresult shows that CL outperforms the E2E scheme by $2\\%$ in mAP.\n","authors":["Junwen Wang","Katayoun Farrahi"],"pdf_url":"https://arxiv.org/pdf/2311.12704v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18491v1","updated":"2023-11-30T12:06:15Z","published":"2023-11-30T12:06:15Z","title":"ZeST-NeRF: Using temporal aggregation for Zero-Shot Temporal NeRFs","summary":" In the field of media production, video editing techniques play a pivotal\nrole. Recent approaches have had great success at performing novel view image\nsynthesis of static scenes. But adding temporal information adds an extra layer\nof complexity. Previous models have focused on implicitly representing static\nand dynamic scenes using NeRF. These models achieve impressive results but are\ncostly at training and inference time. They overfit an MLP to describe the\nscene implicitly as a function of position. This paper proposes ZeST-NeRF, a\nnew approach that can produce temporal NeRFs for new scenes without retraining.\nWe can accurately reconstruct novel views using multi-view synthesis techniques\nand scene flow-field estimation, trained only with unrelated scenes. We\ndemonstrate how existing state-of-the-art approaches from a range of fields\ncannot adequately solve this new task and demonstrate the efficacy of our\nsolution. The resulting network improves quantitatively by 15% and produces\nsignificantly better visual results.\n","authors":["Violeta Menéndez González","Andrew Gilbert","Graeme Phillipson","Stephen Jolly","Simon Hadfield"],"pdf_url":"https://arxiv.org/pdf/2311.18491v1.pdf","comment":"VUA BMVC 2023"},{"id":"http://arxiv.org/abs/2308.09922v2","updated":"2023-11-30T11:58:12Z","published":"2023-08-19T06:21:22Z","title":"MDCS: More Diverse Experts with Consistency Self-distillation for\n Long-tailed Recognition","summary":" Recently, multi-expert methods have led to significant improvements in\nlong-tail recognition (LTR). We summarize two aspects that need further\nenhancement to contribute to LTR boosting: (1) More diverse experts; (2) Lower\nmodel variance. However, the previous methods didn't handle them well. To this\nend, we propose More Diverse experts with Consistency Self-distillation (MDCS)\nto bridge the gap left by earlier methods. Our MDCS approach consists of two\ncore components: Diversity Loss (DL) and Consistency Self-distillation (CS). In\ndetail, DL promotes diversity among experts by controlling their focus on\ndifferent categories. To reduce the model variance, we employ KL divergence to\ndistill the richer knowledge of weakly augmented instances for the experts'\nself-distillation. In particular, we design Confident Instance Sampling (CIS)\nto select the correctly classified instances for CS to avoid biased/noisy\nknowledge. In the analysis and ablation study, we demonstrate that our method\ncompared with previous work can effectively increase the diversity of experts,\nsignificantly reduce the variance of the model, and improve recognition\naccuracy. 
Moreover, the roles of our DL and CS are mutually reinforcing and\ncoupled: the diversity of experts benefits from the CS, and the CS cannot\nachieve remarkable results without the DL. Experiments show our MDCS\noutperforms the state-of-the-art by 1% $\\sim$ 2% on five popular long-tailed\nbenchmarks, including CIFAR10-LT, CIFAR100-LT, ImageNet-LT, Places-LT, and\niNaturalist 2018. The code is available at https://github.com/fistyee/MDCS.\n","authors":["Qihao Zhao","Chen Jiang","Wei Hu","Fan Zhang","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2308.09922v2.pdf","comment":"ICCV2023 Accept. 13 pages"},{"id":"http://arxiv.org/abs/2311.18482v1","updated":"2023-11-30T11:50:07Z","published":"2023-11-30T11:50:07Z","title":"Language Embedded 3D Gaussians for Open-Vocabulary Scene Understanding","summary":" Open-vocabulary querying in 3D space is challenging but essential for scene\nunderstanding tasks such as object localization and segmentation.\nLanguage-embedded scene representations have made progress by incorporating\nlanguage features into 3D spaces. However, their efficacy heavily depends on\nneural networks that are resource-intensive in training and rendering. Although\nrecent 3D Gaussians offer efficient and high-quality novel view synthesis,\ndirectly embedding language features in them leads to prohibitive memory usage\nand decreased performance. In this work, we introduce Language Embedded 3D\nGaussians, a novel scene representation for open-vocabulary query tasks.\nInstead of embedding high-dimensional raw semantic features on 3D Gaussians, we\npropose a dedicated quantization scheme that drastically alleviates the memory\nrequirement, and a novel embedding procedure that achieves smoother yet high\naccuracy query, countering the multi-view feature inconsistencies and the\nhigh-frequency inductive bias in point-based representations. Our comprehensive\nexperiments show that our representation achieves the best visual quality and\nlanguage querying accuracy across current language-embedded representations,\nwhile maintaining real-time rendering frame rates on a single desktop GPU.\n","authors":["Jin-Chuan Shi","Miao Wang","Hao-Bin Duan","Shao-Hua Guan"],"pdf_url":"https://arxiv.org/pdf/2311.18482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18481v1","updated":"2023-11-30T11:47:50Z","published":"2023-11-30T11:47:50Z","title":"ESG Accountability Made Easy: DocQA at Your Service","summary":" We present Deep Search DocQA. This application enables information extraction\nfrom documents via a question-answering conversational assistant. The system\nintegrates several technologies from different AI disciplines consisting of\ndocument conversion to machine-readable format (via computer vision), finding\nrelevant data (via natural language processing), and formulating an eloquent\nresponse (via large language models). Users can explore over 10,000\nEnvironmental, Social, and Governance (ESG) disclosure reports from over 2000\ncorporations. 
The Deep Search platform can be accessed at:\nhttps://ds4sd.github.io.\n","authors":["Lokesh Mishra","Cesar Berrospi","Kasper Dinkla","Diego Antognini","Francesco Fusco","Benedikt Bothur","Maksym Lysak","Nikolaos Livathinos","Ahmed Nassar","Panagiotis Vagenas","Lucas Morin","Christoph Auer","Michele Dolfi","Peter Staar"],"pdf_url":"https://arxiv.org/pdf/2311.18481v1.pdf","comment":"Accepted at the Demonstration Track of the 38th Annual AAAI\n Conference on Artificial Intelligence (AAAI 24)"},{"id":"http://arxiv.org/abs/2111.12727v3","updated":"2023-11-30T11:47:36Z","published":"2021-11-24T19:00:05Z","title":"Generating More Pertinent Captions by Leveraging Semantics and Style on\n Multi-Source Datasets","summary":" This paper addresses the task of generating fluent descriptions by training\non a non-uniform combination of data sources, containing both human-annotated\nand web-collected captions. Large-scale datasets with noisy image-text pairs,\nindeed, provide a sub-optimal source of supervision because of their\nlow-quality descriptive style, while human-annotated datasets are cleaner but\nsmaller in scale. To get the best of both worlds, we propose to leverage and\nseparate semantics and descriptive style through the incorporation of a style\ntoken and keywords extracted through a retrieval component. The proposed model\navoids the need of object detectors, is trained with a single objective of\nprompt language modeling, and can replicate the style of human-collected\ncaptions while training on sources with different input styles. Experimentally,\nthe model shows a strong capability of recognizing real-world concepts and\nproducing high-quality captions. Extensive experiments are performed on\ndifferent image captioning datasets, including CC3M, nocaps, and the\ncompetitive COCO dataset, where our model consistently outperforms baselines\nand state-of-the-art approaches.\n","authors":["Marcella Cornia","Lorenzo Baraldi","Giuseppe Fiameni","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2111.12727v3.pdf","comment":"Accepted to IJCV"},{"id":"http://arxiv.org/abs/2311.17812v2","updated":"2023-11-30T11:28:59Z","published":"2023-11-29T17:03:37Z","title":"DAP: Domain-aware Prompt Learning for Vision-and-Language Navigation","summary":" Following language instructions to navigate in unseen environments is a\nchallenging task for autonomous embodied agents. With strong representation\ncapabilities, pretrained vision-and-language models are widely used in VLN.\nHowever, most of them are trained on web-crawled general-purpose datasets,\nwhich incurs a considerable domain gap when used for VLN tasks. To address the\nproblem, we propose a novel and model-agnostic domain-aware prompt learning\n(DAP) framework. For equipping the pretrained models with specific object-level\nand scene-level cross-modal alignment in VLN tasks, DAP applies a low-cost\nprompt tuning paradigm to learn soft visual prompts for extracting in-domain\nimage semantics. Specifically, we first generate a set of in-domain image-text\npairs with the help of the CLIP model. Then we introduce soft visual prompts in\nthe input space of the visual encoder in a pretrained model. DAP injects\nin-domain visual knowledge into the visual encoder of the pretrained model in\nan efficient way. 
Experimental results on both R2R and REVERIE show the\nsuperiority of DAP compared to existing state-of-the-art methods.\n","authors":["Ting Liu","Yue Hu","Wansen Wu","Youkai Wang","Kai Xu","Quanjun Yin"],"pdf_url":"https://arxiv.org/pdf/2311.17812v2.pdf","comment":"4 pages. arXiv admin note: substantial text overlap with\n arXiv:2309.03661"},{"id":"http://arxiv.org/abs/2309.03661v2","updated":"2023-11-30T11:25:33Z","published":"2023-09-07T11:58:34Z","title":"Prompt-based Context- and Domain-aware Pretraining for Vision and\n Language Navigation","summary":" With strong representation capabilities, pretrained vision-language models\nare widely used in vision and language navigation (VLN). However, most of them\nare trained on web-crawled general-purpose datasets, which incurs a\nconsiderable domain gap when used for VLN tasks. Another challenge for VLN is\nhow the agent understands the contextual relations between actions on a\ntrajectory and performs cross-modal alignment sequentially. In this paper, we\npropose a novel Prompt-bAsed coNtext- and Domain-Aware (PANDA) pretraining\nframework to address these problems. It performs prompting in two stages. In\nthe domain-aware stage, we apply a low-cost prompt tuning paradigm to learn\nsoft visual prompts from an in-domain dataset for equipping the pretrained\nmodels with object-level and scene-level cross-modal alignment in VLN tasks.\nFurthermore, in the context-aware stage, we design a set of hard context\nprompts to capture the sequence-level semantics and instill both out-of-context\nand contextual knowledge in the instruction into cross-modal representations.\nThey enable further tuning of the pretrained models via contrastive learning.\nExperimental results on both R2R and REVERIE show the superiority of PANDA\ncompared to previous state-of-the-art methods.\n","authors":["Ting Liu","Wansen Wu","Yue Hu","Youkai Wang","Kai Xu","Quanjun Yin"],"pdf_url":"https://arxiv.org/pdf/2309.03661v2.pdf","comment":"the paper has some wrong,and we hope withdrawal it"},{"id":"http://arxiv.org/abs/2307.11702v3","updated":"2023-11-30T11:22:53Z","published":"2023-07-21T16:56:36Z","title":"SACReg: Scene-Agnostic Coordinate Regression for Visual Localization","summary":" Scene coordinates regression (SCR), i.e., predicting 3D coordinates for every\npixel of a given image, has recently shown promising potential. However,\nexisting methods remain limited to small scenes memorized during training, and\nthus hardly scale to realistic datasets and scenarios. In this paper, we\npropose a generalized SCR model trained once to be deployed in new test scenes,\nregardless of their scale, without any finetuning. Instead of encoding the\nscene coordinates into the network weights, our model takes as input a database\nimage with some sparse 2D pixel to 3D coordinate annotations, extracted from\ne.g. off-the-shelf Structure-from-Motion or RGB-D data, and a query image for\nwhich are predicted a dense 3D coordinate map and its confidence, based on\ncross-attention. At test time, we rely on existing off-the-shelf image\nretrieval systems and fuse the predictions from a shortlist of relevant\ndatabase images w.r.t. the query. Afterwards camera pose is obtained using\nstandard Perspective-n-Point (PnP). 
Starting from self-supervised CroCo\npretrained weights, we train our model on diverse datasets to ensure\ngeneralizability across various scenarios, and significantly outperform other\nscene regression approaches, including scene-specific models, on multiple\nvisual localization benchmarks. Finally, we show that the database\nrepresentation of images and their 2D-3D annotations can be highly compressed\nwith negligible loss of localization performance.\n","authors":["Jerome Revaud","Yohann Cabon","Romain Brégier","JongMin Lee","Philippe Weinzaepfel"],"pdf_url":"https://arxiv.org/pdf/2307.11702v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.06841v3","updated":"2023-11-30T11:18:14Z","published":"2022-11-13T08:02:03Z","title":"Point-DAE: Denoising Autoencoders for Self-supervised Point Cloud\n Learning","summary":" Masked autoencoder has demonstrated its effectiveness in self-supervised\npoint cloud learning. Considering that masking is a kind of corruption, in this\nwork we explore a more general denoising autoencoder for point cloud learning\n(Point-DAE) by investigating more types of corruptions beyond masking.\nSpecifically, we degrade the point cloud with certain corruptions as input, and\nlearn an encoder-decoder model to reconstruct the original point cloud from its\ncorrupted version. Three corruption families (\ie, density/masking, noise, and\naffine transformation) and a total of fourteen corruption types are\ninvestigated with traditional non-Transformer encoders. Besides the popular\nmasking corruption, we identify another effective corruption family, \ie,\naffine transformation. The affine transformation disturbs all points globally,\nwhich is complementary to the masking corruption where some local regions are\ndropped. We also validate the effectiveness of affine transformation corruption\nwith the Transformer backbones, where we decompose the reconstruction of the\ncomplete point cloud into the reconstructions of detailed local patches and\nrough global shape, alleviating the position leakage problem in the\nreconstruction. Extensive experiments on tasks of object classification,\nfew-shot learning, robustness testing, part segmentation, and 3D object\ndetection validate the effectiveness of the proposed method. The codes are\navailable at \url{https://github.com/YBZh/Point-DAE}.\n","authors":["Yabin Zhang","Jiehong Lin","Ruihuang Li","Kui Jia","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2211.06841v3.pdf","comment":"The codes are available at \url{https://github.com/YBZh/Point-DAE}"},{"id":"http://arxiv.org/abs/2310.09912v3","updated":"2023-11-30T11:03:01Z","published":"2023-10-15T18:44:30Z","title":"Unsupervised Discovery of Interpretable Directions in h-space of\n Pre-trained Diffusion Models","summary":" We propose the first unsupervised and learning-based method to identify\ninterpretable directions in h-space of pre-trained diffusion models. Our method\nis derived from an existing technique that operates on the GAN latent space.\nSpecifically, we employ a shift control module that works on h-space of\npre-trained diffusion models to manipulate a sample into a shifted version of\nitself, followed by a reconstructor to reproduce both the type and the strength\nof the manipulation. By jointly optimizing them, the model will spontaneously\ndiscover disentangled and interpretable directions. To prevent the discovery of\nmeaningless and destructive directions, we employ a discriminator to maintain\nthe fidelity of shifted sample. 
Due to the iterative generative process of\ndiffusion models, our training requires a substantial amount of GPU VRAM to\nstore numerous intermediate tensors for back-propagating gradient. To address\nthis issue, we propose a general VRAM-efficient training algorithm based on\ngradient checkpointing technique to back-propagate any gradient through the\nwhole generative process, with acceptable occupancy of VRAM and sacrifice of\ntraining efficiency. Compared with existing related works on diffusion models,\nour method inherently identifies global and scalable directions, without\nnecessitating any other complicated procedures. Extensive experiments on\nvarious datasets demonstrate the effectiveness of our method.\n","authors":["Zijian Zhang","Luping Liu","Zhijie Lin","Yichen Zhu","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.09912v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18448v1","updated":"2023-11-30T10:50:35Z","published":"2023-11-30T10:50:35Z","title":"HOLD: Category-agnostic 3D Reconstruction of Interacting Hands and\n Objects from Video","summary":" Since humans interact with diverse objects every day, the holistic 3D capture\nof these interactions is important to understand and model human behaviour.\nHowever, most existing methods for hand-object reconstruction from RGB either\nassume pre-scanned object templates or heavily rely on limited 3D hand-object\ndata, restricting their ability to scale and generalize to more unconstrained\ninteraction settings. To this end, we introduce HOLD -- the first\ncategory-agnostic method that reconstructs an articulated hand and object\njointly from a monocular interaction video. We develop a compositional\narticulated implicit model that can reconstruct disentangled 3D hand and object\nfrom 2D images. We also further incorporate hand-object constraints to improve\nhand-object poses and consequently the reconstruction quality. Our method does\nnot rely on 3D hand-object annotations while outperforming fully-supervised\nbaselines in both in-the-lab and challenging in-the-wild settings. Moreover, we\nqualitatively show its robustness in reconstructing from in-the-wild videos.\nCode: https://github.com/zc-alexfan/hold\n","authors":["Zicong Fan","Maria Parelli","Maria Eleni Kadoglou","Muhammed Kocabas","Xu Chen","Michael J. Black","Otmar Hilliges"],"pdf_url":"https://arxiv.org/pdf/2311.18448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18445v1","updated":"2023-11-30T10:49:56Z","published":"2023-11-30T10:49:56Z","title":"VTimeLLM: Empower LLM to Grasp Video Moments","summary":" Large language models (LLMs) have shown remarkable text understanding\ncapabilities, which have been extended as Video LLMs to handle video data for\ncomprehending visual details. However, existing Video LLMs can only provide a\ncoarse description of the entire video, failing to capture the precise start\nand end time boundary of specific events. In this paper, we solve this issue\nvia proposing VTimeLLM, a novel Video LLM designed for fine-grained video\nmoment understanding and reasoning with respect to time boundary. Specifically,\nour VTimeLLM adopts a boundary-aware three-stage training strategy, which\nrespectively utilizes image-text pairs for feature alignment, multiple-event\nvideos to increase temporal-boundary awareness, and high-quality\nvideo-instruction tuning to further improve temporal understanding ability as\nwell as align with human intents. 
Extensive experiments demonstrate that in\nfine-grained time-related comprehension tasks for videos such as Temporal Video\nGrounding and Dense Video Captioning, VTimeLLM significantly outperforms\nexisting Video LLMs. Besides, the fine-grained temporal\nunderstanding of the videos further enables VTimeLLM to beat existing Video LLMs\nin the video dialogue benchmark, showing its superior cross-modal understanding and\nreasoning abilities.\n","authors":["Bin Huang","Xin Wang","Hong Chen","Zihan Song","Wenwu Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.18445v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15202v2","updated":"2023-11-30T10:45:11Z","published":"2023-11-26T05:47:01Z","title":"Dual-stream contrastive predictive network with joint handcrafted\n feature view for SAR ship classification","summary":" Most existing synthetic aperture radar (SAR) ship classification technologies\nheavily rely on correctly labeled data, ignoring the discriminative features of\nunlabeled SAR ship images. Even though researchers try to enrich CNN-based\nfeatures by introducing traditional handcrafted features, existing methods\neasily cause information redundancy and fail to capture the interaction between\nthem. To address these issues, we propose a novel dual-stream contrastive\npredictive network (DCPNet), which consists of two asymmetric task designs and\nthe false negative sample elimination module. The first task is to construct\npositive sample pairs, guiding the core encoder to learn more general\nrepresentations. The second task is to encourage adaptive capture of the\ncorrespondence between deep features and handcrafted features, achieving\nknowledge transfer within the model, and effectively improving the redundancy\ncaused by the feature fusion. To increase the separability between clusters, we\nalso design a cluster-level task. The experimental results on OpenSARShip and\nFUSAR-Ship datasets demonstrate the improvement in classification accuracy of\nsupervised models and confirm the capability of learning effective\nrepresentations of DCPNet.\n","authors":["Xianting Feng","Hao zheng","Zhigang Hu","Liu Yang","Meiguang Zheng"],"pdf_url":"https://arxiv.org/pdf/2311.15202v2.pdf","comment":"6 pages, 3 figures, ICASSP2024"},{"id":"http://arxiv.org/abs/2311.18435v1","updated":"2023-11-30T10:36:19Z","published":"2023-11-30T10:36:19Z","title":"Layered Rendering Diffusion Model for Zero-Shot Guided Image Synthesis","summary":" This paper introduces innovative solutions to enhance spatial controllability\nin diffusion models reliant on text queries. We present two key innovations:\nVision Guidance and the Layered Rendering Diffusion (LRDiff) framework. Vision\nGuidance, a spatial layout condition, acts as a clue in the perturbed\ndistribution, greatly narrowing down the search space, to focus on the image\nsampling process adhering to the spatial layout condition. The LRDiff framework\nconstructs an image-rendering process with multiple layers, each of which\napplies the vision guidance to instructively estimate the denoising direction\nfor a single object. Such a layered rendering strategy effectively prevents\nissues like unintended conceptual blending or mismatches, while allowing for\nmore coherent and contextually accurate image synthesis. The proposed method\nprovides a more efficient and accurate means of synthesising images that align\nwith specific spatial and contextual requirements. 
We demonstrate through our\nexperiments that our method provides better results than existing techniques\nboth quantitatively and qualitatively. We apply our method to three practical\napplications: bounding box-to-image, semantic mask-to-image and image editing.\n","authors":["Zipeng Qi","Guoxi Huang","Zebin Huang","Qin Guo","Jinwen Chen","Junyu Han","Jian Wang","Gang Zhang","Lufei Liu","Errui Ding","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2311.18435v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05950v3","updated":"2023-11-30T10:35:40Z","published":"2023-09-12T04:03:41Z","title":"Language Models as Black-Box Optimizers for Vision-Language Models","summary":" Vision-language models (VLMs) pre-trained on web-scale datasets have\ndemonstrated remarkable capabilities on downstream tasks when fine-tuned with\nminimal data. However, many VLMs rely on proprietary data and are not\nopen-source, which restricts the use of white-box approaches for fine-tuning.\nAs such, we aim to develop a black-box approach to optimize VLMs through\nnatural language prompts, thereby avoiding the need to access model parameters,\nfeature embeddings, or even output logits. We propose employing chat-based LLMs\nto search for the best text prompt for VLMs. Specifically, we adopt an\nautomatic hill-climbing procedure that converges to an effective prompt by\nevaluating the performance of current prompts and asking LLMs to refine them\nbased on textual feedback, all within a conversational process without\nhuman-in-the-loop. In a challenging 1-shot image classification setup, our\nsimple approach surpasses the white-box continuous prompting method (CoOp) by\nan average of 1.5% across 11 datasets including ImageNet. Our approach also\noutperforms both human-engineered and LLM-generated prompts. We highlight the\nadvantage of conversational feedback that incorporates both positive and\nnegative prompts, suggesting that LLMs can utilize the implicit gradient\ndirection in textual feedback for a more efficient search. In addition, we find\nthat the text prompts generated through our strategy are not only more\ninterpretable but also transfer well across different VLM architectures in a\nblack-box manner. Lastly, we demonstrate our framework on a state-of-the-art\nblack-box VLM (DALL-E 3) for text-to-image optimization.\n","authors":["Shihong Liu","Zhiqiu Lin","Samuel Yu","Ryan Lee","Tiffany Ling","Deepak Pathak","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2309.05950v3.pdf","comment":"Project site: llm-can-optimize-vlm.github.io"},{"id":"http://arxiv.org/abs/2311.18433v1","updated":"2023-11-30T10:33:49Z","published":"2023-11-30T10:33:49Z","title":"E2PNet: Event to Point Cloud Registration with Spatio-Temporal\n Representation Learning","summary":" Event cameras have emerged as a promising vision sensor in recent years due\nto their unparalleled temporal resolution and dynamic range. While registration\nof 2D RGB images to 3D point clouds is a long-standing problem in computer\nvision, no prior work studies 2D-3D registration for event cameras. To this\nend, we propose E2PNet, the first learning-based method for event-to-point\ncloud registration. The core of E2PNet is a novel feature representation\nnetwork called Event-Points-to-Tensor (EP2T), which encodes event data into a\n2D grid-shaped feature tensor. This grid-shaped feature enables matured\nRGB-based frameworks to be easily used for event-to-point cloud registration,\nwithout changing hyper-parameters and the training procedure. 
EP2T treats the\nevent input as spatio-temporal point clouds. Unlike standard 3D learning\narchitectures that treat all dimensions of point clouds equally, the novel\nsampling and information aggregation modules in EP2T are designed to handle the\ninhomogeneity of the spatial and temporal dimensions. Experiments on the MVSEC\nand VECtor datasets demonstrate the superiority of E2PNet over hand-crafted and\nother learning-based methods. Compared to RGB-based registration, E2PNet is\nmore robust to extreme illumination or fast motion due to the use of event\ndata. Beyond 2D-3D registration, we also show the potential of EP2T for other\nvision tasks such as flow estimation, event-to-image reconstruction and object\nrecognition. The source code can be found at:\nhttps://github.com/Xmu-qcj/E2PNet.\n","authors":["Xiuhong Lin","Changjie Qiu","Zhipeng Cai","Siqi Shen","Yu Zang","Weiquan Liu","Xuesheng Bian","Matthias Müller","Cheng Wang"],"pdf_url":"https://arxiv.org/pdf/2311.18433v1.pdf","comment":"10 pages, 4 figures, accepted by Thirty-seventh Conference on Neural\n Information Processing Systems(NeurIPS 2023)"},{"id":"http://arxiv.org/abs/2304.10253v2","updated":"2023-11-30T10:14:49Z","published":"2023-04-20T12:21:30Z","title":"Image retrieval outperforms diffusion models on data augmentation","summary":" Many approaches have been proposed to use diffusion models to augment\ntraining datasets for downstream tasks, such as classification. However,\ndiffusion models are themselves trained on large datasets, often with noisy\nannotations, and it remains an open question to which extent these models\ncontribute to downstream classification performance. In particular, it remains\nunclear if they generalize enough to improve over directly using the additional\ndata of their pre-training process for augmentation. We systematically evaluate\na range of existing methods to generate images from diffusion models and study\nnew extensions to assess their benefit for data augmentation. Personalizing\ndiffusion models towards the target data outperforms simpler prompting\nstrategies. However, using the pre-training data of the diffusion model alone,\nvia a simple nearest-neighbor retrieval procedure, leads to even stronger\ndownstream performance. Our study explores the potential of diffusion models in\ngenerating new training data, and surprisingly finds that these sophisticated\nmodels are not yet able to beat a simple and strong image retrieval baseline on\nsimple downstream vision tasks.\n","authors":["Max F. Burg","Florian Wenzel","Dominik Zietlow","Max Horn","Osama Makansi","Francesco Locatello","Chris Russell"],"pdf_url":"https://arxiv.org/pdf/2304.10253v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18420v1","updated":"2023-11-30T10:13:46Z","published":"2023-11-30T10:13:46Z","title":"TeG-DG: Textually Guided Domain Generalization for Face Anti-Spoofing","summary":" Enhancing the domain generalization performance of Face Anti-Spoofing (FAS)\ntechniques has emerged as a research focus. Existing methods are dedicated to\nextracting domain-invariant features from various training domains. Despite the\npromising performance, the extracted features inevitably contain residual style\nfeature bias (e.g., illumination, capture device), resulting in inferior\ngeneralization performance. In this paper, we propose an alternative and\neffective solution, the Textually Guided Domain Generalization (TeG-DG)\nframework, which can effectively leverage text information for cross-domain\nalignment. 
Our core insight is that text, as a more abstract and universal form\nof expression, can capture the commonalities and essential characteristics\nacross various attacks, bridging the gap between different image domains.\nContrary to existing vision-language models, the proposed framework is\nelaborately designed to enhance the domain generalization ability of the FAS\ntask. Concretely, we first design a Hierarchical Attention Fusion (HAF) module\nto enable adaptive aggregation of visual features at different levels; Then, a\nTextual-Enhanced Visual Discriminator (TEVD) is proposed for not only better\nalignment between the two modalities but also to regularize the classifier with\nunbiased text features. TeG-DG significantly outperforms previous approaches,\nespecially in situations with extremely limited source domain data (~14% and\n~12% improvements on HTER and AUC respectively), showcasing impressive few-shot\nperformance.\n","authors":["Lianrui Mu","Jianhong Bai","Xiaoxuan He","Jiangnan Ye","Xiaoyu Liang","Yuchen Yang","Jiedong Zhuang","Haoji Hu"],"pdf_url":"https://arxiv.org/pdf/2311.18420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18405v1","updated":"2023-11-30T09:56:17Z","published":"2023-11-30T09:56:17Z","title":"CAT-DM: Controllable Accelerated Virtual Try-on with Diffusion Model","summary":" Image-based virtual try-on enables users to virtually try on different\ngarments by altering original clothes in their photographs. Generative\nAdversarial Networks (GANs) dominate the research field in image-based virtual\ntry-on, but have not resolved problems such as unnatural deformation of\ngarments and the blurry generation quality. Recently, diffusion models have\nemerged with surprising performance across various image generation tasks.\nWhile the generative quality of diffusion models is impressive, achieving\ncontrollability poses a significant challenge when applying it to virtual\ntry-on tasks and multiple denoising iterations limit its potential for\nreal-time applications. In this paper, we propose Controllable Accelerated\nvirtual Try-on with Diffusion Model called CAT-DM. To enhance the\ncontrollability, a basic diffusion-based virtual try-on network is designed,\nwhich utilizes ControlNet to introduce additional control conditions and\nimproves the feature extraction of garment images. In terms of acceleration,\nCAT-DM initiates a reverse denoising process with an implicit distribution\ngenerated by a pre-trained GAN-based model. Compared with previous try-on\nmethods based on diffusion models, CAT-DM not only retains the pattern and\ntexture details of the in-shop garment but also reduces the sampling steps\nwithout compromising generation quality. Extensive experiments demonstrate the\nsuperiority of CAT-DM against both GAN-based and diffusion-based methods in\nproducing more realistic images and accurately reproducing garment patterns.\nOur code and models will be publicly released.\n","authors":["Jianhao Zeng","Dan Song","Weizhi Nie","Hongshuo Tian","Tongtong Wang","Anan Liu"],"pdf_url":"https://arxiv.org/pdf/2311.18405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18403v1","updated":"2023-11-30T09:55:46Z","published":"2023-11-30T09:55:46Z","title":"Corrupting Convolution-based Unlearnable Datasets with Pixel-based Image\n Transformations","summary":" Unlearnable datasets lead to a drastic drop in the generalization performance\nof models trained on them by introducing elaborate and imperceptible\nperturbations into clean training sets. 
Many existing defenses, e.g., JPEG\ncompression and adversarial training, effectively counter UDs based on\nnorm-constrained additive noise. However, a fire-new type of convolution-based\nUDs have been proposed and render existing defenses all ineffective, presenting\na greater challenge to defenders. To address this, we express the\nconvolution-based unlearnable sample as the result of multiplying a matrix by a\nclean sample in a simplified scenario, and formalize the intra-class matrix\ninconsistency as $\\Theta_{imi}$, inter-class matrix consistency as\n$\\Theta_{imc}$ to investigate the working mechanism of the convolution-based\nUDs. We conjecture that increasing both of these metrics will mitigate the\nunlearnability effect. Through validation experiments that commendably support\nour hypothesis, we further design a random matrix to boost both $\\Theta_{imi}$\nand $\\Theta_{imc}$, achieving a notable degree of defense effect. Hence, by\nbuilding upon and extending these facts, we first propose a brand-new image\nCOrruption that employs randomly multiplicative transformation via\nINterpolation operation to successfully defend against convolution-based UDs.\nOur approach leverages global pixel random interpolations, effectively\nsuppressing the impact of multiplicative noise in convolution-based UDs.\nAdditionally, we have also designed two new forms of convolution-based UDs, and\nfind that our defense is the most effective against them.\n","authors":["Xianlong Wang","Shengshan Hu","Minghui Li","Zhifei Yu","Ziqi Zhou","Leo Yu Zhang","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2311.18403v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18402v1","updated":"2023-11-30T09:51:53Z","published":"2023-11-30T09:51:53Z","title":"MV-CLIP: Multi-View CLIP for Zero-shot 3D Shape Recognition","summary":" Large-scale pre-trained models have demonstrated impressive performance in\nvision and language tasks within open-world scenarios. Due to the lack of\ncomparable pre-trained models for 3D shapes, recent methods utilize\nlanguage-image pre-training to realize zero-shot 3D shape recognition. However,\ndue to the modality gap, pretrained language-image models are not confident\nenough in the generalization to 3D shape recognition. Consequently, this paper\naims to improve the confidence with view selection and hierarchical prompts.\nLeveraging the CLIP model as an example, we employ view selection on the vision\nside by identifying views with high prediction confidence from multiple\nrendered views of a 3D shape. On the textual side, the strategy of hierarchical\nprompts is proposed for the first time. The first layer prompts several\nclassification candidates with traditional class-level descriptions, while the\nsecond layer refines the prediction based on function-level descriptions or\nfurther distinctions between the candidates. Remarkably, without the need for\nadditional training, our proposed method achieves impressive zero-shot 3D\nclassification accuracies of 84.44\\%, 91.51\\%, and 66.17\\% on ModelNet40,\nModelNet10, and ShapeNet Core55, respectively. 
Furthermore, we will make the\ncode publicly available to facilitate reproducibility and further research in\nthis area.\n","authors":["Dan Song","Xinwei Fu","Weizhi Nie","Wenhui Li","Anan Liu"],"pdf_url":"https://arxiv.org/pdf/2311.18402v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18398v1","updated":"2023-11-30T09:49:16Z","published":"2023-11-30T09:49:16Z","title":"RainAI -- Precipitation Nowcasting from Satellite Data","summary":" This paper presents a solution to the Weather4Cast 2023 competition, where\nthe goal is to forecast high-resolution precipitation with an 8-hour lead time\nusing lower-resolution satellite radiance images. We propose a simple, yet\neffective method for spatiotemporal feature learning using a 2D U-Net model,\nthat outperforms the official 3D U-Net baseline in both performance and\nefficiency. We place emphasis on refining the dataset, through importance\nsampling and dataset preparation, and show that such techniques have a\nsignificant impact on performance. We further study an alternative\ncross-entropy loss function that improves performance over the standard mean\nsquared error loss, while also enabling models to produce probabilistic\noutputs. Additional techniques are explored regarding the generation of\npredictions at different lead times, specifically through Conditioning Lead\nTime. Lastly, to generate high-resolution forecasts, we evaluate standard and\nlearned upsampling methods. The code and trained parameters are available at\nhttps://github.com/rafapablos/w4c23-rainai.\n","authors":["Rafael Pablos Sarabia","Joachim Nyborg","Morten Birk","Ira Assent"],"pdf_url":"https://arxiv.org/pdf/2311.18398v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.07711v3","updated":"2023-11-30T09:45:28Z","published":"2022-08-16T12:20:40Z","title":"Local Low-light Image Enhancement via Region-Aware Normalization","summary":" In the realm of Low-Light Image Enhancement (LLIE), existing research\nprimarily focuses on enhancing images globally. However, many applications\nrequire local LLIE, where users are allowed to illuminate specific regions\nusing an input mask, such as creating a protagonist stage or spotlight effect.\nHowever, this task has received limited attention currently. This paper aims to\nsystematically define the requirements of local LLIE and proposes a novel\nstrategy to convert current existing global LLIE methods into local versions.\nThe image space is divided into three regions: Masked Area A be enlightened to\nachieve the desired lighting effects; Transition Area B is a smooth transition\nfrom the enlightened area (Area A) to the unchanged region (Area C). To achieve\nthe task of local LLIE, we introduce Region-Aware Normalization for Local\nEnhancement, dubbed as RANLEN. RANLEN uses a dynamically designed mask-based\nnormalization operation, which enhances an image in a spatially varying manner,\nensuring that the enhancement results are consistent with the requirements\nspecified by the input mask. Additionally, a set of region-aware loss terms is\nformulated to facilitate the learning of the local LLIE framework. 
Our strategy\ncan be applied to existing global LLIE networks with varying structures.\nExtensive experiments demonstrate that our approach can produce the desired\nlighting effects compared to global LLIE, all the while offering controllable\nlocal enhancement with various mask shapes.\n","authors":["Shihurong Yao","Yizhan Huang","Xiaogang Xu"],"pdf_url":"https://arxiv.org/pdf/2208.07711v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16703v2","updated":"2023-11-30T09:35:09Z","published":"2023-11-28T11:27:48Z","title":"CADTalk: An Algorithm and Benchmark for Semantic Commenting of CAD\n Programs","summary":" CAD programs are a popular way to compactly encode shapes as a sequence of\noperations that are easy to parametrically modify. However, without sufficient\nsemantic comments and structure, such programs can be challenging to\nunderstand, let alone modify. We introduce the problem of semantic commenting\nCAD programs, wherein the goal is to segment the input program into code blocks\ncorresponding to semantically meaningful shape parts and assign a semantic\nlabel to each block. We solve the problem by combining program parsing with\nvisual-semantic analysis afforded by recent advances in foundational language\nand vision models. Specifically, by executing the input programs, we create\nshapes, which we use to generate conditional photorealistic images to make use\nof semantic annotators for such images. We then distill the information across\nthe images and link back to the original programs to semantically comment on\nthem. Additionally, we collected and annotated a benchmark dataset, CADTalk,\nconsisting of 5,280 machine-made programs and 45 human-made programs with\nground truth semantic comments to foster future research. We extensively\nevaluated our approach, compared to a GPT-based baseline approach, and an\nopen-set shape segmentation baseline, i.e., PartSLIP, and reported an 83.24%\naccuracy on the new CADTalk dataset. Project page:\nhttps://enigma-li.github.io/CADTalk/.\n","authors":["Haocheng Yuan","Jing Xu","Hao Pan","Adrien Bousseau","Niloy Mitra","Changjian Li"],"pdf_url":"https://arxiv.org/pdf/2311.16703v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09809v2","updated":"2023-11-30T09:31:59Z","published":"2023-09-18T14:28:47Z","title":"A Continual Learning Paradigm for Non-differentiable Visual Programming\n Frameworks on Visual Reasoning Tasks","summary":" Recently, the visual programming framework (VisProg) has emerged as a\nsignificant framework for executing compositional visual tasks due to its\ninterpretability and flexibility. However, the performance of VisProg on\nspecific Visual Reasoning (VR) tasks is markedly inferior compared to\nwell-trained task-specific models since its employed visual sub-modules have\nlimited generalization capabilities. Due to the non-differentiability of\nVisProg, it is quite challenging to improve these visual sub-modules within\nVisProg for the specific VR task while maintaining their generalizability on\nthe un-seen tasks. Attempt to overcome these difficulties, we propose CLVP, a\nContinuous Learning paradigm for VisProg across various visual reasoning tasks.\nSpecifically, our CLVP distills the capabilities of well-trained task-specific\nmodels into the visual sub-modules in a stepwise and anti-forgetting manner.\nThis can continually improve the performance of VisProg on multiple visual\ntasks while preserving the flexibility of VisProg. 
Extensive and comprehensive\nexperimental results demonstrate that our CLVP obtains significant performance\ngains on specific VR benchmarks, i.e., GQA (+1.4%) and NLVRv2 (+5.6%), compared\nto the VisProg baseline, and also maintains a promising generalizability for VR\non un-seen and previous learned tasks.\n","authors":["Wentao Wan","Nan Kang","Zeqing Wang","Zhuojie Yang","Liang Lin","Keze Wang"],"pdf_url":"https://arxiv.org/pdf/2309.09809v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17002v2","updated":"2023-11-30T09:30:19Z","published":"2023-11-28T17:57:44Z","title":"Ranni: Taming Text-to-Image Diffusion for Accurate Instruction Following","summary":" Existing text-to-image (T2I) diffusion models usually struggle in\ninterpreting complex prompts, especially those with quantity, object-attribute\nbinding, and multi-subject descriptions. In this work, we introduce a semantic\npanel as the middleware in decoding texts to images, supporting the generator\nto better follow instructions. The panel is obtained through arranging the\nvisual concepts parsed from the input text by the aid of large language models,\nand then injected into the denoising network as a detailed control signal to\ncomplement the text condition. To facilitate text-to-panel learning, we come up\nwith a carefully designed semantic formatting protocol, accompanied by a\nfully-automatic data preparation pipeline. Thanks to such a design, our\napproach, which we call Ranni, manages to enhance a pre-trained T2I generator\nregarding its textual controllability. More importantly, the introduction of\nthe generative middleware brings a more convenient form of interaction (i.e.,\ndirectly adjusting the elements in the panel or using language instructions)\nand further allows users to finely customize their generation, based on which\nwe develop a practical system and showcase its potential in continuous\ngeneration and chatting-based editing. Our project page is at\nhttps://ranni-t2i.github.io/Ranni.\n","authors":["Yutong Feng","Biao Gong","Di Chen","Yujun Shen","Yu Liu","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.17002v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18387v1","updated":"2023-11-30T09:30:15Z","published":"2023-11-30T09:30:15Z","title":"On Exact Inversion of DPM-Solvers","summary":" Diffusion probabilistic models (DPMs) are a key component in modern\ngenerative models. DPM-solvers have achieved reduced latency and enhanced\nquality significantly, but have posed challenges to find the exact inverse\n(i.e., finding the initial noise from the given image). Here we investigate the\nexact inversions for DPM-solvers and propose algorithms to perform them when\nsamples are generated by the first-order as well as higher-order DPM-solvers.\nFor each explicit denoising step in DPM-solvers, we formulated the inversions\nusing implicit methods such as gradient descent or forward step method to\nensure the robustness to large classifier-free guidance unlike the prior\napproach using fixed-point iteration. Experimental results demonstrated that\nour proposed exact inversion methods significantly reduced the error of both\nimage and noise reconstructions, greatly enhanced the ability to distinguish\ninvisible watermarks and well prevented unintended background changes\nconsistently during image editing. 
Project page:\n\\url{https://smhongok.github.io/inv-dpm.html}.\n","authors":["Seongmin Hong","Kyeonghyun Lee","Suh Yoon Jeon","Hyewon Bae","Se Young Chun"],"pdf_url":"https://arxiv.org/pdf/2311.18387v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2311.18373v1","updated":"2023-11-30T09:14:37Z","published":"2023-11-30T09:14:37Z","title":"A Survey on Deep Learning for Polyp Segmentation: Techniques, Challenges\n and Future Trends","summary":" Early detection and assessment of polyps play a crucial role in the\nprevention and treatment of colorectal cancer (CRC). Polyp segmentation\nprovides an effective solution to assist clinicians in accurately locating and\nsegmenting polyp regions. In the past, people often relied on manually\nextracted lower-level features such as color, texture, and shape, which often\nhad issues capturing global context and lacked robustness to complex scenarios.\nWith the advent of deep learning, more and more outstanding medical image\nsegmentation algorithms based on deep learning networks have emerged, making\nsignificant progress in this field. This paper provides a comprehensive review\nof polyp segmentation algorithms. We first review some traditional algorithms\nbased on manually extracted features and deep segmentation algorithms, then\ndetail benchmark datasets related to the topic. Specifically, we carry out a\ncomprehensive evaluation of recent deep learning models and results based on\npolyp sizes, considering the pain points of research topics and differences in\nnetwork structures. Finally, we discuss the challenges of polyp segmentation\nand future trends in this field. The models, benchmark datasets, and source\ncode links we collected are all published at\nhttps://github.com/taozh2017/Awesome-Polyp-Segmentation.\n","authors":["Jiaxin Mei","Tao Zhou","Kaiwen Huang","Yizhe Zhang","Yi Zhou","Ye Wu","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2311.18373v1.pdf","comment":"15 pages, 7 figures, and"},{"id":"http://arxiv.org/abs/2311.18363v1","updated":"2023-11-30T09:03:47Z","published":"2023-11-30T09:03:47Z","title":"Each Test Image Deserves A Specific Prompt: Continual Test-Time\n Adaptation for 2D Medical Image Segmentation","summary":" Distribution shift widely exists in medical images acquired from different\nmedical centres and poses a significant obstacle to deploying the pre-trained\nsemantic segmentation model in real-world applications. Test-time adaptation\nhas proven its effectiveness in tackling the cross-domain distribution shift\nduring inference. However, most existing methods achieve adaptation by updating\nthe pre-trained models, rendering them susceptible to error accumulation and\ncatastrophic forgetting when encountering a series of distribution shifts\n(i.e., under the continual test-time adaptation setup). To overcome these\nchallenges caused by updating the models, in this paper, we freeze the\npre-trained model and propose the Visual Prompt-based Test-Time Adaptation\n(VPTTA) method to train a specific prompt for each test image to align the\nstatistics in the batch normalization layers. Specifically, we present the\nlow-frequency prompt, which is lightweight with only a few parameters and can\nbe effectively trained in a single iteration. To enhance prompt initialization,\nwe equip VPTTA with a memory bank to benefit the current prompt from previous\nones. Additionally, we design a warm-up mechanism, which mixes source and\ntarget statistics to construct warm-up statistics, thereby facilitating the\ntraining process. 
Extensive experiments demonstrate the superiority of our\nVPTTA over other state-of-the-art methods on two medical image segmentation\nbenchmark tasks. The code and weights of pre-trained source models are\navailable at https://github.com/Chen-Ziyang/VPTTA.\n","authors":["Ziyang Chen","Yiwen Ye","Mengkang Lu","Yongsheng Pan","Yong Xia"],"pdf_url":"https://arxiv.org/pdf/2311.18363v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18361v1","updated":"2023-11-30T09:02:02Z","published":"2023-11-30T09:02:02Z","title":"Automating lookahead planning using site appearance and space\n utilization","summary":" This study proposes a method to automate the development of lookahead\nplanning. The proposed method uses construction material conditions (i.e.,\nappearances) and site space utilization to predict task completion rates. A\nGated Recurrent Unit (GRU) based Recurrent Neural Network (RNN) model was\ntrained using a segment of a construction project timeline to estimate\ncompletion rates of tasks and propose data-aware lookahead plans. The proposed\nmethod was evaluated in a sample construction project involving finishing works\nsuch as plastering, painting, and installing electrical fixtures. The results\nshow that the proposed method can assist with developing automated lookahead\nplans. In doing so, this study links construction planning with actual events\nat the construction site. It extends the traditional scheduling techniques and\nintegrates a broader spectrum of site spatial constraints into lookahead\nplanning.\n","authors":["Eyob Mengiste","Borja Garcia de Soto","Timo Hartmann"],"pdf_url":"https://arxiv.org/pdf/2311.18361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18358v1","updated":"2023-11-30T09:00:44Z","published":"2023-11-30T09:00:44Z","title":"TIDE: Test Time Few Shot Object Detection","summary":" Few-shot object detection (FSOD) aims to extract semantic knowledge from\nlimited object instances of novel categories within a target domain. Recent\nadvances in FSOD focus on fine-tuning the base model based on a few objects via\nmeta-learning or data augmentation. Despite their success, the majority of them\nare grounded with parametric readjustment to generalize on novel objects, which\nface considerable challenges in Industry 5.0, such as (i) a certain amount of\nfine-tuning time is required, and (ii) the parameters of the constructed model\nbeing unavailable due to the privilege protection, making the fine-tuning fail.\nSuch constraints naturally limit its application in scenarios with real-time\nconfiguration requirements or within black-box settings. To tackle the\nchallenges mentioned above, we formalize a novel FSOD task, referred to as Test\nTIme Few Shot DEtection (TIDE), where the model is un-tuned in the\nconfiguration procedure. To that end, we introduce an asymmetric architecture\nfor learning a support-instance-guided dynamic category classifier. Further, a\ncross-attention module and a multi-scale resizer are provided to enhance the\nmodel performance. Experimental results on multiple few-shot object detection\nplatforms reveal that the proposed TIDE significantly outperforms existing\ncontemporary methods. 
The implementation codes are available at\nhttps://github.com/deku-0621/TIDE\n","authors":["Weikai Li","Hongfeng Wei","Yanlai Wu","Jie Yang","Yudi Ruan","Yuan Li","Ying Tang"],"pdf_url":"https://arxiv.org/pdf/2311.18358v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13172v3","updated":"2023-11-30T08:55:24Z","published":"2023-05-22T16:00:00Z","title":"Editing Large Language Models: Problems, Methods, and Opportunities","summary":" Despite the ability to train capable LLMs, the methodology for maintaining\ntheir relevancy and rectifying errors remains elusive. To this end, the past\nfew years have witnessed a surge in techniques for editing LLMs, the objective\nof which is to efficiently alter the behavior of LLMs within a specific domain\nwithout negatively impacting performance across other inputs. This paper\nembarks on a deep exploration of the problems, methods, and opportunities\nrelated to model editing for LLMs. In particular, we provide an exhaustive\noverview of the task definition and challenges associated with model editing,\nalong with an in-depth empirical analysis of the most progressive methods\ncurrently at our disposal. We also build a new benchmark dataset to facilitate\na more robust evaluation and pinpoint enduring issues intrinsic to existing\ntechniques. Our objective is to provide valuable insights into the\neffectiveness and feasibility of each editing technique, thereby assisting the\ncommunity in making informed decisions on the selection of the most appropriate\nmethod for a specific task or context. Code and datasets are available at\nhttps://github.com/zjunlp/EasyEdit.\n","authors":["Yunzhi Yao","Peng Wang","Bozhong Tian","Siyuan Cheng","Zhoubo Li","Shumin Deng","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.13172v3.pdf","comment":"EMNLP 2023. Updated with new experiments"},{"id":"http://arxiv.org/abs/2311.17618v2","updated":"2023-11-30T08:46:05Z","published":"2023-11-29T13:26:29Z","title":"ShapeGPT: 3D Shape Generation with A Unified Multi-modal Language Model","summary":" The advent of large language models, enabling flexibility through\ninstruction-driven approaches, has revolutionized many traditional generative\ntasks, but large models for 3D data, particularly in comprehensively handling\n3D shapes with other modalities, are still under-explored. By achieving\ninstruction-based shape generations, versatile multimodal generative shape\nmodels can significantly benefit various fields like 3D virtual construction\nand network-aided design. In this work, we present ShapeGPT, a shape-included\nmulti-modal framework to leverage strong pre-trained language models to address\nmultiple shape-relevant tasks. Specifically, ShapeGPT employs a\nword-sentence-paragraph framework to discretize continuous shapes into shape\nwords, further assembles these words for shape sentences, as well as integrates\nshape with instructional text for multi-modal paragraphs. To learn this\nshape-language model, we use a three-stage training scheme, including shape\nrepresentation, multimodal alignment, and instruction-based generation, to\nalign shape-language codebooks and learn the intricate correlations among these\nmodalities. 
Extensive experiments demonstrate that ShapeGPT achieves comparable\nperformance across shape-relevant tasks, including text-to-shape,\nshape-to-text, shape completion, and shape editing.\n","authors":["Fukun Yin","Xin Chen","Chi Zhang","Biao Jiang","Zibo Zhao","Jiayuan Fan","Gang Yu","Taihao Li","Tao Chen"],"pdf_url":"https://arxiv.org/pdf/2311.17618v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18344v1","updated":"2023-11-30T08:31:49Z","published":"2023-11-30T08:31:49Z","title":"DSeg: Direct Line Segments Detection","summary":" This paper presents a model-driven approach to detect image line segments.\nThe approach incrementally detects segments on the gradient image using a\nlinear Kalman filter that estimates the supporting line parameters and their\nassociated variances. The algorithm is fast and robust with respect to image\nnoise and illumination variations; it allows the detection of longer line\nsegments than data-driven approaches, and does not require any tedious\nparameter tuning. An extension of the algorithm that exploits a pyramidal\napproach to enhance the quality of results is proposed. Results with varying\nscene illumination and comparisons to classic existing approaches are\npresented.\n","authors":["Berger Cyrille","Lacroix Simon"],"pdf_url":"https://arxiv.org/pdf/2311.18344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11810v3","updated":"2023-11-30T08:27:38Z","published":"2023-11-20T14:42:25Z","title":"DocPedia: Unleashing the Power of Large Multimodal Model in the\n Frequency Domain for Versatile Document Understanding","summary":" This work presents DocPedia, a novel large multimodal model (LMM) for\nversatile OCR-free document understanding, capable of parsing images up to\n2,560$\\times$2,560 resolution. Unlike existing works, which either struggle with\nhigh-resolution documents or give up the large language model and thus constrain\ntheir vision or language ability, our DocPedia directly processes visual input in\nthe frequency domain rather than the pixel space. This unique characteristic\nenables DocPedia to capture a greater amount of visual and textual information\nusing a limited number of visual tokens. To consistently enhance both\nperception and comprehension abilities of our model, we develop a dual-stage\ntraining strategy and enrich instructions/annotations of all training tasks\ncovering multiple document types. Extensive quantitative and qualitative\nexperiments conducted on various publicly available benchmarks confirm the\nmutual benefits of jointly learning perception and comprehension tasks. The\nresults provide further evidence of the effectiveness and superior performance\nof our DocPedia over other methods.\n","authors":["Hao Feng","Qi Liu","Hao Liu","Wengang Zhou","Houqiang Li","Can Huang"],"pdf_url":"https://arxiv.org/pdf/2311.11810v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16926v2","updated":"2023-11-30T08:20:02Z","published":"2023-11-28T16:31:27Z","title":"LLaFS: When Large-Language Models Meet Few-Shot Segmentation","summary":" This paper proposes LLaFS, the first attempt to leverage large language\nmodels (LLMs) in few-shot segmentation. In contrast to the conventional\nfew-shot segmentation methods that only rely on the limited and biased\ninformation from the annotated support images, LLaFS leverages the vast prior\nknowledge gained by LLM as an effective supplement and directly uses the LLM to\nsegment images in a few-shot manner. 
To enable the text-based LLM to handle\nimage-related tasks, we carefully design an input instruction that allows the\nLLM to produce segmentation results represented as polygons, and propose a\nregion-attribute table to simulate the human visual mechanism and provide\nmulti-modal guidance. We also synthesize pseudo samples and use curriculum\nlearning for pretraining to augment data and achieve better optimization. LLaFS\nachieves state-of-the-art results on multiple datasets, showing the potential\nof using LLMs for few-shot computer vision tasks. Code will be available at\nhttps://github.com/lanyunzhu99/LLaFS.\n","authors":["Lanyun Zhu","Tianrun Chen","Deyi Ji","Jieping Ye","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2311.16926v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18332v1","updated":"2023-11-30T08:03:53Z","published":"2023-11-30T08:03:53Z","title":"Multilevel Saliency-Guided Self-Supervised Learning for Image Anomaly\n Detection","summary":" Anomaly detection (AD) is a fundamental task in computer vision. It aims to\nidentify incorrect image data patterns which deviate from the normal ones.\nConventional methods generally address AD by preparing augmented negative\nsamples to enforce self-supervised learning. However, these techniques\ntypically do not consider semantics during augmentation, leading to the\ngeneration of unrealistic or invalid negative samples. Consequently, the\nfeature extraction network can be hindered from embedding critical features. In\nthis study, inspired by visual attention learning approaches, we propose\nCutSwap, which leverages saliency guidance to incorporate semantic cues for\naugmentation. Specifically, we first employ LayerCAM to extract multilevel\nimage features as saliency maps and then perform clustering to obtain multiple\ncentroids. To fully exploit saliency guidance, on each map, we select a pixel\npair from the cluster with the highest centroid saliency to form a patch pair.\nSuch a patch pair includes highly similar context information with dense\nsemantic correlations. The resulting negative sample is created by swapping the\nlocations of the patch pair. Compared to prior augmentation methods, CutSwap\ngenerates more subtle yet realistic negative samples to facilitate quality\nfeature learning. Extensive experimental and ablative evaluations demonstrate\nthat our method achieves state-of-the-art AD performance on two mainstream AD\nbenchmark datasets.\n","authors":["Jianjian Qin","Chunzhi Gu","Jun Yu","Chao Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.18332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18331v1","updated":"2023-11-30T08:02:49Z","published":"2023-11-30T08:02:49Z","title":"MRFP: Learning Generalizable Semantic Segmentation from Sim-2-Real with\n Multi-Resolution Feature Perturbation","summary":" Deep neural networks have shown exemplary performance on semantic scene\nunderstanding tasks on source domains, but due to the absence of style\ndiversity during training, enhancing performance on unseen target domains using\nonly single source domain data remains a challenging task. Generation of\nsimulated data is a feasible alternative to retrieving large style-diverse\nreal-world datasets as it is a cumbersome and budget-intensive process.\nHowever, the large domain-specific inconsistencies between simulated and\nreal-world data pose a significant generalization challenge in semantic\nsegmentation. 
In this work, to alleviate this problem, we propose a novel\nMultiResolution Feature Perturbation (MRFP) technique to randomize\ndomain-specific fine-grained features and perturb style of coarse features. Our\nexperimental results on various urban-scene segmentation datasets clearly\nindicate that, along with the perturbation of style-information, perturbation\nof fine-feature components is paramount to learn domain invariant robust\nfeature maps for semantic segmentation models. MRFP is a simple and\ncomputationally efficient, transferable module with no additional learnable\nparameters or objective functions, that helps state-of-the-art deep neural\nnetworks to learn robust domain invariant features for simulation-to-real\nsemantic segmentation.\n","authors":["Sumanth Udupa","Prajwal Gurunath","Aniruddh Sikdar","Suresh Sundaram"],"pdf_url":"https://arxiv.org/pdf/2311.18331v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.18328v1","updated":"2023-11-30T07:58:54Z","published":"2023-11-30T07:58:54Z","title":"Advances in 3D Neural Stylization: A Survey","summary":" Modern artificial intelligence provides a novel way of producing digital art\nin styles. The expressive power of neural networks enables the realm of visual\nstyle transfer methods, which can be used to edit images, videos, and 3D data\nto make them more artistic and diverse. This paper reports on recent advances\nin neural stylization for 3D data. We provide a taxonomy for neural stylization\nby considering several important design choices, including scene\nrepresentation, guidance data, optimization strategies, and output styles.\nBuilding on such taxonomy, our survey first revisits the background of neural\nstylization on 2D images, and then provides in-depth discussions on recent\nneural stylization methods for 3D data, where we also provide a mini-benchmark\non artistic stylization methods. Based on the insights gained from the survey,\nwe then discuss open challenges, future research, and potential applications\nand impacts of neural stylization.\n","authors":["Yingshu Chen","Guocheng Shao","Ka Chun Shum","Binh-Son Hua","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2311.18328v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2311.17460v2","updated":"2023-11-30T07:58:17Z","published":"2023-11-29T09:02:07Z","title":"W-HMR: Human Mesh Recovery in World Space with Weak-supervised Camera\n Calibration and Orientation Correction","summary":" For a long time, in the field of reconstructing 3D human bodies from\nmonocular images, most methods opted to simplify the task by minimizing the\ninfluence of the camera. Using a coarse focal length setting results in the\nreconstructed bodies not aligning well with distorted images. Ignoring camera\nrotation leads to an unrealistic reconstructed body pose in world space.\nConsequently, existing methods' application scenarios are confined to\ncontrolled environments. And they struggle to achieve accurate and reasonable\nreconstruction in world space when confronted with complex and diverse\nin-the-wild images. To address the above issues, we propose W-HMR, which\ndecouples global body recovery into camera calibration, local body recovery and\nglobal body orientation correction. We design the first weak-supervised camera\ncalibration method for body distortion, eliminating dependence on focal length\nlabels and achieving finer mesh-image alignment. 
We propose a novel orientation\ncorrection module to allow the reconstructed human body to remain normal in\nworld space. Decoupling body orientation and body pose enables our model to\nconsider the accuracy in camera coordinate and the reasonableness in world\ncoordinate simultaneously, expanding the range of applications. As a result,\nW-HMR achieves high-quality reconstruction in dual coordinate systems,\nparticularly in challenging scenes. Codes will be released on\nhttps://yw0208.github.io/w-hmr/ after publication.\n","authors":["Wei Yao","Hongwen Zhang","Yunlian Sun","Jinhui Tang"],"pdf_url":"https://arxiv.org/pdf/2311.17460v2.pdf","comment":"Project Page: https://yw0208.github.io/w-hmr/"},{"id":"http://arxiv.org/abs/2211.11293v2","updated":"2023-11-30T07:56:19Z","published":"2022-11-21T09:34:07Z","title":"Beyond the Field-of-View: Enhancing Scene Visibility and Perception with\n Clip-Recurrent Transformer","summary":" Vision sensors are widely applied in vehicles, robots, and roadside\ninfrastructure. However, due to limitations in hardware cost and system size,\ncamera Field-of-View (FoV) is often restricted and may not provide sufficient\ncoverage. Nevertheless, from a spatiotemporal perspective, it is possible to\nobtain information beyond the camera's physical FoV from past video streams. In\nthis paper, we propose the concept of online video inpainting for autonomous\nvehicles to expand the field of view, thereby enhancing scene visibility,\nperception, and system safety. To achieve this, we introduce the FlowLens\narchitecture, which explicitly employs optical flow and implicitly incorporates\na novel clip-recurrent transformer for feature propagation. FlowLens offers two\nkey features: 1) FlowLens includes a newly designed Clip-Recurrent Hub with\n3D-Decoupled Cross Attention (DDCA) to progressively process global information\naccumulated over time. 2) It integrates a multi-branch Mix Fusion Feed Forward\nNetwork (MixF3N) to enhance the precise spatial flow of local features. To\nfacilitate training and evaluation, we derive the KITTI360 dataset with various\nFoV mask, which covers both outer- and inner FoV expansion scenarios. We also\nconduct quantitative assessments of beyond-FoV semantics across different\nmodels and perform qualitative comparisons of beyond-FoV object detection. We\nillustrate that employing FlowLens to reconstruct unseen scenes even enhances\nperception within the field of view by providing reliable semantic context.\nExtensive experiments and user studies involving offline and online video\ninpainting, as well as beyond-FoV perception tasks, demonstrate that FlowLens\nachieves state-of-the-art performance. 
The source code and dataset are made\npublicly available at https://github.com/MasterHow/FlowLens.\n","authors":["Hao Shi","Qi Jiang","Kailun Yang","Xiaoting Yin","Huajian Ni","Kaiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2211.11293v2.pdf","comment":"The source code and dataset are made publicly available at\n https://github.com/MasterHow/FlowLens"},{"id":"http://arxiv.org/abs/2209.09207v2","updated":"2023-11-30T07:55:36Z","published":"2022-08-31T14:20:30Z","title":"Table Detection in the Wild: A Novel Diverse Table Detection Dataset and\n Method","summary":" Recent deep learning approaches in table detection have achieved outstanding\nperformance and proved to be effective in identifying document layouts.\nCurrently available table detection benchmarks have many limitations,\nincluding the lack of sample diversity, simple table structures, the lack of\ntraining cases, and poor sample quality. In this paper, we introduce a diverse\nlarge-scale dataset for table detection with more than seven thousand samples\ncontaining a wide variety of table structures collected from many diverse\nsources. In addition, we also present baseline results using a\nconvolutional neural network-based method to detect table structure in\ndocuments. Experimental results show the superiority of applying convolutional\ndeep learning methods over classical computer vision-based methods. The\nintroduction of this diverse table detection dataset will enable the community\nto develop high-throughput deep learning methods for understanding document\nlayout and tabular data processing. Dataset is available at: 1.\nhttps://www.kaggle.com/datasets/mrinalim/stdw-dataset 2.\nhttps://huggingface.co/datasets/n3011/STDW\n","authors":["Mrinal Haloi","Shashank Shekhar","Nikhil Fande","Siddhant Swaroop Dash","Sanjay G"],"pdf_url":"https://arxiv.org/pdf/2209.09207v2.pdf","comment":"Open source Table detection dataset and baseline results"},{"id":"http://arxiv.org/abs/2309.16483v2","updated":"2023-11-30T07:47:46Z","published":"2023-09-28T14:45:54Z","title":"Rethinking Domain Generalization: Discriminability and Generalizability","summary":" Domain generalization (DG) endeavors to develop robust models that possess\nstrong generalizability while preserving excellent discriminability.\nNonetheless, pivotal DG techniques tend to improve the feature generalizability\nby learning domain-invariant representations, inadvertently overlooking the\nfeature discriminability. On the one hand, the simultaneous attainment of\ngeneralizability and discriminability of features presents a complex challenge,\noften entailing inherent contradictions. This challenge becomes particularly\npronounced when domain-invariant features manifest reduced discriminability\nowing to the inclusion of unstable factors, \\emph{i.e.,} spurious correlations.\nOn the other hand, prevailing domain-invariant methods can be categorized as\ncategory-level alignment, susceptible to discarding indispensable features\npossessing substantial generalizability and narrowing intra-class variations.\nTo surmount these obstacles, we rethink DG from a new perspective that\nconcurrently imbues features with formidable discriminability and robust\ngeneralizability, and present a novel framework, namely, Discriminative\nMicroscopic Distribution Alignment (DMDA). DMDA incorporates two core\ncomponents: Selective Channel Pruning~(SCP) and Micro-level Distribution\nAlignment (MDA). 
Concretely, SCP attempts to curtail redundancy within neural\nnetworks, prioritizing stable attributes conducive to accurate classification.\nThis approach alleviates the adverse effect of spurious domain invariance and\namplifies the feature discriminability. Besides, MDA accentuates micro-level\nalignment within each class, going beyond mere category-level alignment. This\nstrategy accommodates sufficient generalizable features and facilitates\nwithin-class variations. Extensive experiments on four benchmark datasets\ncorroborate the efficacy of our method.\n","authors":["Shaocong Long","Qianyu Zhou","Chenhao Ying","Lizhuang Ma","Yuan Luo"],"pdf_url":"https://arxiv.org/pdf/2309.16483v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05846v2","updated":"2023-11-30T07:42:04Z","published":"2023-06-09T12:18:48Z","title":"Motion-DVAE: Unsupervised learning for fast human motion denoising","summary":" Pose and motion priors are crucial for recovering realistic and accurate\nhuman motion from noisy observations. Substantial progress has been made on\npose and shape estimation from images, and recent works showed impressive\nresults using priors to refine frame-wise predictions. However, a lot of motion\npriors only model transitions between consecutive poses and are used in\ntime-consuming optimization procedures, which is problematic for many\napplications requiring real-time motion capture. We introduce Motion-DVAE, a\nmotion prior to capture the short-term dependencies of human motion. As part of\nthe dynamical variational autoencoder (DVAE) models family, Motion-DVAE\ncombines the generative capability of VAE models and the temporal modeling of\nrecurrent architectures. Together with Motion-DVAE, we introduce an\nunsupervised learned denoising method unifying regression- and\noptimization-based approaches in a single framework for real-time 3D human pose\nestimation. Experiments show that the proposed approach reaches competitive\nperformance with state-of-the-art methods while being much faster.\n","authors":["Guénolé Fiche","Simon Leglaive","Xavier Alameda-Pineda","Renaud Séguier"],"pdf_url":"https://arxiv.org/pdf/2306.05846v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18311v1","updated":"2023-11-30T07:29:30Z","published":"2023-11-30T07:29:30Z","title":"Anisotropic Neural Representation Learning for High-Quality Neural\n Rendering","summary":" Neural radiance fields (NeRFs) have achieved impressive view synthesis\nresults by learning an implicit volumetric representation from multi-view\nimages. To project the implicit representation into an image, NeRF employs\nvolume rendering that approximates the continuous integrals of rays as an\naccumulation of the colors and densities of the sampled points. Although this\napproximation enables efficient rendering, it ignores the direction information\nin point intervals, resulting in ambiguous features and limited reconstruction\nquality. In this paper, we propose an anisotropic neural representation\nlearning method that utilizes learnable view-dependent features to improve\nscene representation and reconstruction. We model the volumetric function as\nspherical harmonic (SH)-guided anisotropic features, parameterized by\nmultilayer perceptrons, facilitating ambiguity elimination while preserving the\nrendering efficiency. To achieve robust scene reconstruction without anisotropy\noverfitting, we regularize the energy of the anisotropic features during\ntraining. 
Our method is flexible and can be plugged into NeRF-based\nframeworks. Extensive experiments show that the proposed representation can\nboost the rendering quality of various NeRFs and achieve state-of-the-art\nrendering performance on both synthetic and real-world scenes.\n","authors":["Y. Wang","J. Xu","Y. Zeng","Y. Gong"],"pdf_url":"https://arxiv.org/pdf/2311.18311v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18307v1","updated":"2023-11-30T07:25:24Z","published":"2023-11-30T07:25:24Z","title":"Categorical Traffic Transformer: Interpretable and Diverse Behavior\n Prediction with Tokenized Latent","summary":" Adept traffic models are critical to both planning and closed-loop simulation\nfor autonomous vehicles (AV), and key design objectives include accuracy,\ndiverse multimodal behaviors, interpretability, and downstream compatibility.\nRecently, with the advent of large language models (LLMs), an additional\ndesirable feature for traffic models is LLM compatibility. We present\nCategorical Traffic Transformer (CTT), a traffic model that outputs both\ncontinuous trajectory predictions and tokenized categorical predictions (lane\nmodes, homotopies, etc.). The most outstanding feature of CTT is its fully\ninterpretable latent space, which enables direct supervision of the latent\nvariable from the ground truth during training and avoids mode collapse\ncompletely. As a result, CTT can generate diverse behaviors conditioned on\ndifferent latent modes with semantic meanings while beating SOTA on prediction\naccuracy. In addition, CTT's ability to input and output tokens enables\nintegration with LLMs for common-sense reasoning and zero-shot generalization.\n","authors":["Yuxiao Chen","Sander Tonkens","Marco Pavone"],"pdf_url":"https://arxiv.org/pdf/2311.18307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07688v2","updated":"2023-11-30T07:19:34Z","published":"2023-07-15T02:42:19Z","title":"DRM-IR: Task-Adaptive Deep Unfolding Network for All-In-One Image\n Restoration","summary":" Existing All-In-One image restoration (IR) methods usually lack flexible\nmodeling of various types of degradation, thus impeding the restoration\nperformance. To achieve All-In-One IR with higher task dexterity, this work\nproposes an efficient Dynamic Reference Modeling paradigm (DRM-IR), which\nconsists of task-adaptive degradation modeling and model-based image restoring.\nSpecifically, these two subtasks are formalized as a pair of entangled\nreference-based maximum a posteriori (MAP) inferences, which are optimized\nsynchronously in an unfolding-based manner. With the two cascaded subtasks,\nDRM-IR first dynamically models the task-specific degradation based on a\nreference image pair and further restores the image with the collected\ndegradation statistics. Besides, to bridge the semantic gap between the\nreference and target degraded images, we further devise a Degradation Prior\nTransmitter (DPT) that restrains the instance-specific feature differences.\nDRM-IR explicitly provides superior flexibility for All-in-One IR while being\ninterpretable. 
Extensive experiments on multiple benchmark datasets show that\nour DRM-IR achieves state-of-the-art in All-In-One IR.\n","authors":["Yuanshuo Cheng","Mingwen Shao","Yecong Wan","Chao Wang"],"pdf_url":"https://arxiv.org/pdf/2307.07688v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18303v1","updated":"2023-11-30T07:14:00Z","published":"2023-11-30T07:14:00Z","title":"OmniMotionGPT: Animal Motion Generation with Limited Data","summary":" Our paper aims to generate diverse and realistic animal motion sequences from\ntextual descriptions, without a large-scale animal text-motion dataset. While\nthe task of text-driven human motion synthesis is already extensively studied\nand benchmarked, it remains challenging to transfer this success to other\nskeleton structures with limited data. In this work, we design a model\narchitecture that imitates Generative Pretraining Transformer (GPT), utilizing\nprior knowledge learned from human data to the animal domain. We jointly train\nmotion autoencoders for both animal and human motions and at the same time\noptimize through the similarity scores among human motion encoding, animal\nmotion encoding, and text CLIP embedding. Presenting the first solution to this\nproblem, we are able to generate animal motions with high diversity and\nfidelity, quantitatively and qualitatively outperforming the results of\ntraining human motion generation baselines on animal data. Additionally, we\nintroduce AnimalML3D, the first text-animal motion dataset with 1240 animation\nsequences spanning 36 different animal identities. We hope this dataset would\nmediate the data scarcity problem in text-driven animal motion generation,\nproviding a new playground for the research community.\n","authors":["Zhangsihao Yang","Mingyuan Zhou","Mengyi Shan","Bingbing Wen","Ziwei Xuan","Mitch Hill","Junjie Bai","Guo-Jun Qi","Yalin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.18303v1.pdf","comment":"The project page is at https://zshyang.github.io/omgpt-website/"},{"id":"http://arxiv.org/abs/2304.10880v3","updated":"2023-11-30T07:10:39Z","published":"2023-04-21T10:47:13Z","title":"Med-Tuning: Parameter-Efficient Transfer Learning with Fine-Grained\n Feature Enhancement for Medical Volumetric Segmentation","summary":" Deep learning-based medical volumetric segmentation methods either train the\nmodel from scratch or follow the standard ``pre-training then fine-tuning\"\nparadigm. Although fine-tuning a pre-trained model on downstream tasks can\nharness its representation power, the standard full fine-tuning is costly in\nterms of computation and memory footprint. In this paper, we present the study\non parameter-efficient transfer learning for medical volumetric segmentation\nand propose a new framework named Med-Tuning based on intra-stage feature\nenhancement and inter-stage feature interaction. Additionally, aiming at\nexploiting the intrinsic global properties of Fourier Transform for\nparameter-efficient transfer learning, a new adapter block namely Med-Adapter\nwith a well-designed Fourier Transform branch is proposed for effectively and\nefficiently modeling the crucial global context for medical volumetric\nsegmentation. Given a large-scale pre-trained model on 2D natural images, our\nmethod can exploit both the crucial spatial multi-scale feature and volumetric\ncorrelations along slices for accurate segmentation. 
Extensive experiments on\nthree benchmark datasets (including CT and MRI) show that our method can\nachieve better results than previous parameter-efficient transfer learning\nmethods on segmentation tasks, with much less tuned parameter costs. Compared\nto full fine-tuning, our method reduces the fine-tuned model parameters by up\nto 4x, with even better segmentation performance. The code will be made\npublicly available at https://github.com/jessie-chen99/Med-Tuning.\n","authors":["Wenxuan Wang","Jiachen Shen","Chen Chen","Jianbo Jiao","Jing Liu","Yan Zhang","Shanshan Song","Jiangyun Li"],"pdf_url":"https://arxiv.org/pdf/2304.10880v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18299v1","updated":"2023-11-30T07:09:46Z","published":"2023-11-30T07:09:46Z","title":"Reconstructing the normal and shape at specularities in endoscopy","summary":" Specularities are numerous in endoscopic images. They occur as many white\nsmall elliptic spots, which are generally ruled out as nuisance in image\nanalysis and computer vision methods. Instead, we propose to use specularities\nas cues for 3D perception. Specifically, we propose a new method to\nreconstruct, at each specularity, the observed tissue's normal direction (i.e.,\nits orientation) and shape (i.e., its curvature) from a single image. We show\nresults on simulated and real interventional images.\n","authors":["Karim Makki","Adrien Bartoli"],"pdf_url":"https://arxiv.org/pdf/2311.18299v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2311.14837v2","updated":"2023-11-30T18:14:48Z","published":"2023-11-24T20:16:38Z","title":"Benchmarking Robustness of Text-Image Composed Retrieval","summary":" Text-image composed retrieval aims to retrieve the target image through the\ncomposed query, which is specified in the form of an image plus some text that\ndescribes desired modifications to the input image. It has recently attracted\nattention due to its ability to leverage both information-rich images and\nconcise language to precisely express the requirements for target images.\nHowever, the robustness of these approaches against real-world corruptions or\nfurther text understanding has never been studied. In this paper, we perform\nthe first robustness study and establish three new diversified benchmarks for\nsystematic analysis of text-image composed retrieval against natural\ncorruptions in both vision and text and further probe textural understanding.\nFor natural corruption analysis, we introduce two new large-scale benchmark\ndatasets, CIRR-C and FashionIQ-C for testing in open domain and fashion domain\nrespectively, both of which apply 15 visual corruptions and 7 textural\ncorruptions. For textural understanding analysis, we introduce a new diagnostic\ndataset CIRR-D by expanding the original raw data with synthetic data, which\ncontains modified text to better probe textual understanding ability including\nnumerical variation, attribute variation, object removal, background variation,\nand fine-grained evaluation. 
The code and benchmark datasets are available at\nhttps://github.com/SunTongtongtong/Benchmark-Robustness-Text-Image-Compose-Retrieval.\n","authors":["Shitong Sun","Jindong Gu","Shaogang Gong"],"pdf_url":"https://arxiv.org/pdf/2311.14837v2.pdf","comment":"Accepted by R0-FoMo: Workshop on Robustness of Few-shot and Zero-shot\n Learning in Foundation Models at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.18724v1","updated":"2023-11-30T17:22:55Z","published":"2023-11-30T17:22:55Z","title":"Routing-Guided Learned Product Quantization for Graph-Based Approximate\n Nearest Neighbor Search","summary":" Given a vector dataset $\\mathcal{X}$, a query vector $\\vec{x}_q$, graph-based\nApproximate Nearest Neighbor Search (ANNS) aims to build a proximity graph (PG)\nas an index of $\\mathcal{X}$ and approximately return vectors with minimum\ndistances to $\\vec{x}_q$ by searching over the PG index. It suffers from the\nlarge-scale $\\mathcal{X}$ because a PG with full vectors is too large to fit\ninto the memory, e.g., a billion-scale $\\mathcal{X}$ in 128 dimensions would\nconsume nearly 600 GB memory. To solve this, Product Quantization (PQ)\nintegrated graph-based ANNS is proposed to reduce the memory usage, using\nsmaller compact codes of quantized vectors in memory instead of the large\noriginal vectors. Existing PQ methods do not consider the important routing\nfeatures of PG, resulting in low-quality quantized vectors that affect the\nANNS's effectiveness. In this paper, we present an end-to-end Routing-guided\nlearned Product Quantization (RPQ) for graph-based ANNS. It consists of (1) a\n\\textit{differentiable quantizer} used to make the standard discrete PQ\ndifferentiable to suit for back-propagation of end-to-end learning, (2) a\n\\textit{sampling-based feature extractor} used to extract neighborhood and\nrouting features of a PG, and (3) a \\textit{multi-feature joint training\nmodule} with two types of feature-aware losses to continuously optimize the\ndifferentiable quantizer. As a result, the inherent features of a PG would be\nembedded into the learned PQ, generating high-quality quantized vectors.\nMoreover, we integrate our RPQ with the state-of-the-art DiskANN and existing\npopular PGs to improve their performance. Comprehensive experiments on\nreal-world large-scale datasets (from 1M to 1B) demonstrate RPQ's superiority,\ne.g., 1.7$\\times$-4.2$\\times$ improvement on QPS at the same recall@10 of 95\\%.\n","authors":["Qiang Yue","Xiaoliang Xu","Yuxiang Wang","Yikun Tao","Xuliyuan Luo"],"pdf_url":"https://arxiv.org/pdf/2311.18724v1.pdf","comment":"14 pages, 12 figures"},{"id":"http://arxiv.org/abs/2306.08018v4","updated":"2023-11-30T15:29:58Z","published":"2023-06-13T14:35:34Z","title":"Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for\n Large Language Models","summary":" Large Language Models (LLMs), with their remarkable task-handling\ncapabilities and innovative outputs, have catalyzed significant advancements\nacross a spectrum of fields. However, their proficiency within specialized\ndomains such as biomolecular studies remains limited. To address this\nchallenge, we introduce Mol-Instructions, a comprehensive instruction dataset\ndesigned for the biomolecular domain. Mol-Instructions encompasses three key\ncomponents: molecule-oriented instructions, protein-oriented instructions, and\nbiomolecular text instructions. Each component aims to improve the\nunderstanding and prediction capabilities of LLMs concerning biomolecular\nfeatures and behaviors. 
Through extensive instruction tuning experiments on\nLLMs, we demonstrate the effectiveness of Mol-Instructions in enhancing large\nmodels' performance in the intricate realm of biomolecular studies, thus\nfostering progress in the biomolecular research community. Mol-Instructions is\npublicly available for ongoing research and will undergo regular updates to\nenhance its applicability.\n","authors":["Yin Fang","Xiaozhuan Liang","Ningyu Zhang","Kangwei Liu","Rui Huang","Zhuo Chen","Xiaohui Fan","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2306.08018v4.pdf","comment":"Project homepage: https://github.com/zjunlp/Mol-Instructions, add\n more experiments"},{"id":"http://arxiv.org/abs/2311.18604v1","updated":"2023-11-30T15:00:25Z","published":"2023-11-30T15:00:25Z","title":"Barwise Music Structure Analysis with the Correlation Block-Matching\n Segmentation Algorithm","summary":" Music Structure Analysis (MSA) is a Music Information Retrieval task\nconsisting of representing a song in a simplified, organized manner by breaking\nit down into sections typically corresponding to ``chorus'', ``verse'',\n``solo'', etc. In this work, we extend an MSA algorithm called the Correlation\nBlock-Matching (CBM) algorithm introduced by (Marmoret et al., 2020, 2022b).\nThe CBM algorithm is a dynamic programming algorithm that segments\nself-similarity matrices, which are a standard description used in MSA and in\nnumerous other applications. In this work, self-similarity matrices are\ncomputed from the feature representation of an audio signal and time is sampled\nat the bar-scale. This study examines three different standard similarity\nfunctions for the computation of self-similarity matrices. Results show that,\nin optimal conditions, the proposed algorithm achieves a level of performance\nwhich is competitive with supervised state-of-the-art methods while only\nrequiring knowledge of bar positions. In addition, the algorithm is made\nopen-source and is highly customizable.\n","authors":["Axel Marmoret","Jérémy E. Cohen","Frédéric Bimbot"],"pdf_url":"https://arxiv.org/pdf/2311.18604v1.pdf","comment":"19 pages, 13 figures, 11 tables, 1 algorithm, published in\n Transactions of the International Society for Music Information Retrieval"},{"id":"http://arxiv.org/abs/2311.18550v1","updated":"2023-11-30T13:36:21Z","published":"2023-11-30T13:36:21Z","title":"Search Still Matters: Information Retrieval in the Era of Generative AI","summary":" Objective: Information retrieval (IR, also known as search) systems are\nubiquitous in modern times. How does the emergence of generative artificial\nintelligence (AI), based on large language models (LLMs), fit into the IR\nprocess? Process: This perspective explores the use of generative AI in the\ncontext of the motivations, considerations, and outcomes of the IR process with\na focus on the academic use of such systems. Conclusions: There are many\ninformation needs, from simple to complex, that motivate use of IR. Users of\nsuch systems, particularly academics, have concerns for authoritativeness,\ntimeliness, and contextualization of search. While LLMs may provide\nfunctionality that aids the IR process, the continued need for search systems,\nand research into their improvement, remains essential.\n","authors":["William R. 
Hersh"],"pdf_url":"https://arxiv.org/pdf/2311.18550v1.pdf","comment":"6 pages, no figures"},{"id":"http://arxiv.org/abs/2311.18503v1","updated":"2023-11-30T12:28:43Z","published":"2023-11-30T12:28:43Z","title":"End-to-End Retrieval with Learned Dense and Sparse Representations Using\n Lucene","summary":" The bi-encoder architecture provides a framework for understanding\nmachine-learned retrieval models based on dense and sparse vector\nrepresentations. Although these representations capture parametric realizations\nof the same underlying conceptual framework, their respective implementations\nof top-$k$ similarity search require the coordination of different software\ncomponents (e.g., inverted indexes, HNSW indexes, and toolkits for neural\ninference), often knitted together in complex architectures. In this work, we\nask the following question: What's the simplest design, in terms of requiring\nthe fewest changes to existing infrastructure, that can support end-to-end\nretrieval with modern dense and sparse representations? The answer appears to\nbe that Lucene is sufficient, as we demonstrate in Anserini, a toolkit for\nreproducible information retrieval research. That is, effective retrieval with\nmodern single-vector neural models can be efficiently performed directly in\nJava on the CPU. We examine the implications of this design for information\nretrieval researchers pushing the state of the art as well as for software\nengineers building production search systems.\n","authors":["Haonan Chen","Carlos Lassance","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2311.18503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00402v5","updated":"2023-11-30T09:15:51Z","published":"2023-09-30T14:55:44Z","title":"DiskANN++: Efficient Page-based Search over Isomorphic Mapped Graph\n Index using Query-sensitivity Entry Vertex","summary":" Given a vector dataset $\\mathcal{X}$ and a query vector $\\vec{x}_q$,\ngraph-based Approximate Nearest Neighbor Search (ANNS) aims to build a graph\nindex $G$ and approximately return vectors with minimum distances to\n$\\vec{x}_q$ by searching over $G$. The main drawback of graph-based ANNS is\nthat a graph index would be too large to fit into the memory especially for a\nlarge-scale $\\mathcal{X}$. To solve this, a Product Quantization (PQ)-based\nhybrid method called DiskANN is proposed to store a low-dimensional PQ index in\nmemory and retain a graph index in SSD, thus reducing memory overhead while\nensuring a high search accuracy. However, it suffers from two I/O issues that\nsignificantly affect the overall efficiency: (1) long routing path from an\nentry vertex to the query's neighborhood that results in large number of I/O\nrequests and (2) redundant I/O requests during the routing process. We propose\nan optimized DiskANN++ to overcome above issues. Specifically, for the first\nissue, we present a query-sensitive entry vertex selection strategy to replace\nDiskANN's static graph-central entry vertex by a dynamically determined entry\nvertex that is close to the query. For the second I/O issue, we present an\nisomorphic mapping on DiskANN's graph index to optimize the SSD layout and\npropose an asynchronously optimized Pagesearch based on the optimized SSD\nlayout as an alternative to DiskANN's beamsearch. Comprehensive experimental\nstudies on eight real-world datasets demonstrate our DiskANN++'s superiority on\nefficiency. 
We achieve a notable 1.5x to 2.2x improvement on QPS compared to\nDiskANN, given the same accuracy constraint.\n","authors":["Jiongkang Ni","Xiaoliang Xu","Yuxiang Wang","Can Li","Jiajie Yao","Shihai Xiao","Xuecang Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.00402v5.pdf","comment":"15 pages including references"},{"id":"http://arxiv.org/abs/2305.13172v3","updated":"2023-11-30T08:55:24Z","published":"2023-05-22T16:00:00Z","title":"Editing Large Language Models: Problems, Methods, and Opportunities","summary":" Despite the ability to train capable LLMs, the methodology for maintaining\ntheir relevancy and rectifying errors remains elusive. To this end, the past\nfew years have witnessed a surge in techniques for editing LLMs, the objective\nof which is to efficiently alter the behavior of LLMs within a specific domain\nwithout negatively impacting performance across other inputs. This paper\nembarks on a deep exploration of the problems, methods, and opportunities\nrelated to model editing for LLMs. In particular, we provide an exhaustive\noverview of the task definition and challenges associated with model editing,\nalong with an in-depth empirical analysis of the most progressive methods\ncurrently at our disposal. We also build a new benchmark dataset to facilitate\na more robust evaluation and pinpoint enduring issues intrinsic to existing\ntechniques. Our objective is to provide valuable insights into the\neffectiveness and feasibility of each editing technique, thereby assisting the\ncommunity in making informed decisions on the selection of the most appropriate\nmethod for a specific task or context. Code and datasets are available at\nhttps://github.com/zjunlp/EasyEdit.\n","authors":["Yunzhi Yao","Peng Wang","Bozhong Tian","Siyuan Cheng","Zhoubo Li","Shumin Deng","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.13172v3.pdf","comment":"EMNLP 2023. Updated with new experiments"},{"id":"http://arxiv.org/abs/2305.03972v2","updated":"2023-11-30T07:16:00Z","published":"2023-05-06T08:12:11Z","title":"Image to Multi-Modal Retrieval for Industrial Scenarios","summary":" We formally define a novel and valuable information retrieval task:\nimage-to-multi-modal-retrieval (IMMR), where the query is an image and the document\nis an entity with both an image and a textual description. The IMMR task is valuable in\nvarious industrial applications. We analyze three key challenges for IMMR: 1)\nskewed data and noisy labels in metric learning, 2) multi-modality fusion, 3)\neffective and efficient training in large-scale industrial scenarios. To tackle\nthe above challenges, we propose a novel framework for the IMMR task. Our framework\nconsists of three components: 1) a novel data governance scheme coupled with a\nlarge-scale classification-based learning paradigm, 2) a model architecture\nspecially designed for multimodal learning, where the proposed concept-aware\nmodality fusion module adaptively fuses image and text modalities, and 3) a hybrid\nparallel training approach for tackling large-scale training in industrial\nscenarios. The proposed framework achieves SOTA performance on public datasets\nand has been deployed in a real-world industrial search system, leading to\nsignificant improvements in click-through rate and deal number. 
Code and data\nwill be made publicly available.\n","authors":["Zida Cheng","Chen Ju","Xu Chen","Zhonghua Zhai","Shuai Xiao","Xiaoyi Zeng","Weilin Huang"],"pdf_url":"https://arxiv.org/pdf/2305.03972v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18244v1","updated":"2023-11-30T04:25:28Z","published":"2023-11-30T04:25:28Z","title":"Poisoning Attacks Against Contrastive Recommender Systems","summary":" Contrastive learning (CL) has recently gained significant popularity in the\nfield of recommendation. Its ability to learn without heavy reliance on labeled\ndata is a natural antidote to the data sparsity issue. Previous research has\nfound that CL can not only enhance recommendation accuracy but also\ninadvertently exhibit remarkable robustness against noise. However, this paper\nidentifies a vulnerability of CL-based recommender systems: Compared with their\nnon-CL counterparts, they are even more susceptible to poisoning attacks that\naim to promote target items. Our analysis points to the uniform dispersion of\nrepresentations led by the CL loss as the very factor that accounts for this\nvulnerability. We further theoretically and empirically demonstrate that the\noptimization of CL loss can lead to smooth spectral values of representations.\nBased on these insights, we attempt to reveal the potential poisoning attacks\nagainst CL-based recommender systems. The proposed attack encompasses a\ndual-objective framework: One that induces a smoother spectral value\ndistribution to amplify the CL loss's inherent dispersion effect, named\ndispersion promotion; and the other that directly elevates the visibility of\ntarget items, named rank promotion. We validate the destructiveness of our\nattack model through extensive experimentation on four datasets. By shedding\nlight on these vulnerabilities, we aim to facilitate the development of more\nrobust CL-based recommender systems.\n","authors":["Zongwei Wang","Junliang Yu","Min Gao","Hongzhi Yin","Bin Cui","Shazia Sadiq"],"pdf_url":"https://arxiv.org/pdf/2311.18244v1.pdf","comment":"14pages,6 figures,5 tables"},{"id":"http://arxiv.org/abs/2311.18213v1","updated":"2023-11-30T03:13:36Z","published":"2023-11-30T03:13:36Z","title":"Beyond Two-Tower Matching: Learning Sparse Retrievable\n Cross-Interactions for Recommendation","summary":" Two-tower models are a prevalent matching framework for recommendation, which\nhave been widely deployed in industrial applications. The success of two-tower\nmatching attributes to its efficiency in retrieval among a large number of\nitems, since the item tower can be precomputed and used for fast Approximate\nNearest Neighbor (ANN) search. However, it suffers two main challenges,\nincluding limited feature interaction capability and reduced accuracy in online\nserving. Existing approaches attempt to design novel late interactions instead\nof dot products, but they still fail to support complex feature interactions or\nlose retrieval efficiency. To address these challenges, we propose a new\nmatching paradigm named SparCode, which supports not only sophisticated feature\ninteractions but also efficient retrieval. Specifically, SparCode introduces an\nall-to-all interaction module to model fine-grained query-item interactions.\nBesides, we design a discrete code-based sparse inverted index jointly trained\nwith the model to achieve effective and efficient model inference. Extensive\nexperiments have been conducted on open benchmark datasets to demonstrate the\nsuperiority of our framework. 
The results show that SparCode significantly\nimproves the accuracy of candidate item matching while retaining the same level\nof retrieval efficiency with two-tower models. Our source code will be\navailable at MindSpore/models.\n","authors":["Liangcai Su","Fan Yan","Jieming Zhu","Xi Xiao","Haoyi Duan","Zhou Zhao","Zhenhua Dong","Ruiming Tang"],"pdf_url":"https://arxiv.org/pdf/2311.18213v1.pdf","comment":"Accepted by SIGIR 2023. Code will be available at\n https://reczoo.github.io/SparCode"},{"id":"http://arxiv.org/abs/2110.15114v2","updated":"2023-11-30T02:52:40Z","published":"2021-10-28T13:53:56Z","title":"UltraGCN: Ultra Simplification of Graph Convolutional Networks for\n Recommendation","summary":" With the recent success of graph convolutional networks (GCNs), they have\nbeen widely applied for recommendation, and achieved impressive performance\ngains. The core of GCNs lies in its message passing mechanism to aggregate\nneighborhood information. However, we observed that message passing largely\nslows down the convergence of GCNs during training, especially for large-scale\nrecommender systems, which hinders their wide adoption. LightGCN makes an early\nattempt to simplify GCNs for collaborative filtering by omitting feature\ntransformations and nonlinear activations. In this paper, we take one step\nfurther to propose an ultra-simplified formulation of GCNs (dubbed UltraGCN),\nwhich skips infinite layers of message passing for efficient recommendation.\nInstead of explicit message passing, UltraGCN resorts to directly approximate\nthe limit of infinite-layer graph convolutions via a constraint loss.\nMeanwhile, UltraGCN allows for more appropriate edge weight assignments and\nflexible adjustment of the relative importances among different types of\nrelationships. This finally yields a simple yet effective UltraGCN model, which\nis easy to implement and efficient to train. Experimental results on four\nbenchmark datasets show that UltraGCN not only outperforms the state-of-the-art\nGCN models but also achieves more than 10x speedup over LightGCN. Our source\ncode will be available at https://reczoo.github.io/UltraGCN.\n","authors":["Kelong Mao","Jieming Zhu","Xi Xiao","Biao Lu","Zhaowei Wang","Xiuqiang He"],"pdf_url":"https://arxiv.org/pdf/2110.15114v2.pdf","comment":"Accepted by CIKM 2021. Code available at:\n https://reczoo.github.io/UltraGCN"},{"id":"http://arxiv.org/abs/2109.12613v3","updated":"2023-11-30T02:38:32Z","published":"2021-09-26T14:09:25Z","title":"SimpleX: A Simple and Strong Baseline for Collaborative Filtering","summary":" Collaborative filtering (CF) is a widely studied research topic in\nrecommender systems. The learning of a CF model generally depends on three\nmajor components, namely interaction encoder, loss function, and negative\nsampling. While many existing studies focus on the design of more powerful\ninteraction encoders, the impacts of loss functions and negative sampling\nratios have not yet been well explored. In this work, we show that the choice\nof loss function as well as negative sampling ratio is equivalently important.\nMore specifically, we propose the cosine contrastive loss (CCL) and further\nincorporate it to a simple unified CF model, dubbed SimpleX. Extensive\nexperiments have been conducted on 11 benchmark datasets and compared with 29\nexisting CF models in total. 
Surprisingly, the results show that, under our CCL\nloss and a large negative sampling ratio, SimpleX can surpass most\nsophisticated state-of-the-art models by a large margin (e.g., max 48.5%\nimprovement in NDCG@20 over LightGCN). We believe that SimpleX could not only\nserve as a simple strong baseline to foster future research on CF, but also\nshed light on the potential research direction towards improving loss function\nand negative sampling. Our source code will be available at\nhttps://reczoo.github.io/SimpleX.\n","authors":["Kelong Mao","Jieming Zhu","Jinpeng Wang","Quanyu Dai","Zhenhua Dong","Xi Xiao","Xiuqiang He"],"pdf_url":"https://arxiv.org/pdf/2109.12613v3.pdf","comment":"Accepted by CIKM 2021. Code available at\n https://reczoo.github.io/SimpleX"},{"id":"http://arxiv.org/abs/2311.18195v1","updated":"2023-11-30T02:27:34Z","published":"2023-11-30T02:27:34Z","title":"COVID-19 Vaccine Misinformation in Middle Income Countries","summary":" This paper introduces a multilingual dataset of COVID-19 vaccine\nmisinformation, consisting of annotated tweets from three middle-income\ncountries: Brazil, Indonesia, and Nigeria. The expertly curated dataset\nincludes annotations for 5,952 tweets, assessing their relevance to COVID-19\nvaccines, presence of misinformation, and the themes of the misinformation. To\naddress challenges posed by domain specificity, the low-resource setting, and\ndata imbalance, we adopt two approaches for developing COVID-19 vaccine\nmisinformation detection models: domain-specific pre-training and text\naugmentation using a large language model. Our best misinformation detection\nmodels demonstrate improvements ranging from 2.7 to 15.9 percentage points in\nmacro F1-score compared to the baseline models. Additionally, we apply our\nmisinformation detection models in a large-scale study of 19 million unlabeled\ntweets from the three countries between 2020 and 2022, showcasing the practical\napplication of our dataset and models for detecting and analyzing vaccine\nmisinformation in multiple countries and languages. Our analysis indicates that\npercentage changes in the number of new COVID-19 cases are positively\nassociated with COVID-19 vaccine misinformation rates in a staggered manner for\nBrazil and Indonesia, and there are significant positive associations between\nthe misinformation rates across the three countries.\n","authors":["Jongin Kim","Byeo Rhee Back","Aditya Agrawal","Jiaxi Wu","Veronika J. Wirtz","Traci Hong","Derry Wijaya"],"pdf_url":"https://arxiv.org/pdf/2311.18195v1.pdf","comment":"Accepted to EMNLP 2023 (Main conference), 9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2304.00902v4","updated":"2023-11-30T02:27:15Z","published":"2023-04-03T11:46:30Z","title":"FinalMLP: An Enhanced Two-Stream MLP Model for CTR Prediction","summary":" Click-through rate (CTR) prediction is one of the fundamental tasks for\nonline advertising and recommendation. While multi-layer perceptron (MLP)\nserves as a core component in many deep CTR prediction models, it has been\nwidely recognized that applying a vanilla MLP network alone is inefficient in\nlearning multiplicative feature interactions. As such, many two-stream\ninteraction models (e.g., DeepFM and DCN) have been proposed by integrating an\nMLP network with another dedicated network for enhanced CTR prediction. 
As the\nMLP stream learns feature interactions implicitly, existing research focuses\nmainly on enhancing explicit feature interactions in the complementary stream.\nIn contrast, our empirical study shows that a well-tuned two-stream MLP model\nthat simply combines two MLPs can even achieve surprisingly good performance,\nwhich has never been reported before by existing work. Based on this\nobservation, we further propose feature gating and interaction aggregation\nlayers that can be easily plugged to make an enhanced two-stream MLP model,\nFinalMLP. In this way, it not only enables differentiated feature inputs but\nalso effectively fuses stream-level interactions across two streams. Our\nevaluation results on four open benchmark datasets as well as an online A/B\ntest in our industrial system show that FinalMLP achieves better performance\nthan many sophisticated two-stream CTR models. Our source code will be\navailable at MindSpore/models.\n","authors":["Kelong Mao","Jieming Zhu","Liangcai Su","Guohao Cai","Yuru Li","Zhenhua Dong"],"pdf_url":"https://arxiv.org/pdf/2304.00902v4.pdf","comment":"Accepted by AAAI 2023. Code available at\n https://reczoo.github.io/FinalMLP"},{"id":"http://arxiv.org/abs/2306.08808v3","updated":"2023-11-30T02:12:03Z","published":"2023-06-15T01:51:06Z","title":"ReLoop2: Building Self-Adaptive Recommendation Models via Responsive\n Error Compensation Loop","summary":" Industrial recommender systems face the challenge of operating in\nnon-stationary environments, where data distribution shifts arise from evolving\nuser behaviors over time. To tackle this challenge, a common approach is to\nperiodically re-train or incrementally update deployed deep models with newly\nobserved data, resulting in a continual training process. However, the\nconventional learning paradigm of neural networks relies on iterative\ngradient-based updates with a small learning rate, making it slow for large\nrecommendation models to adapt. In this paper, we introduce ReLoop2, a\nself-correcting learning loop that facilitates fast model adaptation in online\nrecommender systems through responsive error compensation. Inspired by the\nslow-fast complementary learning system observed in human brains, we propose an\nerror memory module that directly stores error samples from incoming data\nstreams. These stored samples are subsequently leveraged to compensate for\nmodel prediction errors during testing, particularly under distribution shifts.\nThe error memory module is designed with fast access capabilities and undergoes\ncontinual refreshing with newly observed data samples during the model serving\nphase to support fast model adaptation. We evaluate the effectiveness of\nReLoop2 on three open benchmark datasets as well as a real-world production\ndataset. The results demonstrate the potential of ReLoop2 in enhancing the\nresponsiveness and adaptiveness of recommender systems operating in\nnon-stationary environments.\n","authors":["Jieming Zhu","Guohao Cai","Junjie Huang","Zhenhua Dong","Ruiming Tang","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.08808v3.pdf","comment":"Accepted by KDD 2023"},{"id":"http://arxiv.org/abs/2009.05794v5","updated":"2023-11-30T02:01:41Z","published":"2020-09-12T13:34:22Z","title":"BARS-CTR: Open Benchmarking for Click-Through Rate Prediction","summary":" Click-through rate (CTR) prediction is a critical task for many applications,\nas its accuracy has a direct impact on user experience and platform revenue. 
In\nrecent years, CTR prediction has been widely studied in both academia and\nindustry, resulting in a wide variety of CTR prediction models. Unfortunately,\nthere is still a lack of standardized benchmarks and uniform evaluation\nprotocols for CTR prediction research. This leads to non-reproducible or even\ninconsistent experimental results among existing studies, which largely limits\nthe practical value and potential impact of their research. In this work, we\naim to perform open benchmarking for CTR prediction and present a rigorous\ncomparison of different models in a reproducible manner. To this end, we ran\nover 7,000 experiments for more than 12,000 GPU hours in total to re-evaluate\n24 existing models on multiple datasets and settings. Surprisingly, our\nexperiments show that with sufficient hyper-parameter search and model tuning,\nmany deep models have smaller differences than expected. The results also\nreveal that making real progress on the modeling of CTR prediction is indeed a\nvery challenging research task. We believe that our benchmarking work could not\nonly allow researchers to gauge the effectiveness of new models conveniently\nbut also make them fairly compare with the state of the arts. We have publicly\nreleased the benchmarking code, evaluation protocols, and hyper-parameter\nsettings of our work to promote reproducible research in this field.\n","authors":["Jieming Zhu","Jinyang Liu","Shuai Yang","Qi Zhang","Xiuqiang He"],"pdf_url":"https://arxiv.org/pdf/2009.05794v5.pdf","comment":"Accepted by CIKM 2021. See the benchmark at\n https://openbenchmark.github.io/BARS/CTR"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2311.18838v1","updated":"2023-11-30T18:59:56Z","published":"2023-11-30T18:59:56Z","title":"Dataset Distillation in Large Data Era","summary":" Dataset distillation aims to generate a smaller but representative subset\nfrom a large dataset, which allows a model to be trained efficiently, meanwhile\nevaluating on the original testing data distribution to achieve decent\nperformance. Many prior works have aimed to align with diverse aspects of the\noriginal datasets, such as matching the training weight trajectories, gradient,\nfeature/BatchNorm distributions, etc. In this work, we show how to distill\nvarious large-scale datasets such as full ImageNet-1K/21K under a conventional\ninput resolution of 224$\\times$224 to achieve the best accuracy over all\nprevious approaches, including SRe$^2$L, TESLA and MTT. To achieve this, we\nintroduce a simple yet effective ${\\bf C}$urriculum ${\\bf D}$ata ${\\bf\nA}$ugmentation ($\\texttt{CDA}$) during data synthesis that obtains the accuracy\non large-scale ImageNet-1K and 21K with 63.2% under IPC (Images Per Class) 50\nand 36.1% under IPC 20, respectively. Finally, we show that, by integrating all\nour enhancements together, the proposed model beats the current\nstate-of-the-art by more than 4% Top-1 accuracy on ImageNet-1K/21K and for the\nfirst time, reduces the gap to its full-data training counterpart to less than\nabsolute 15%. Moreover, this work represents the inaugural success in dataset\ndistillation on larger-scale ImageNet-21K under the standard 224$\\times$224\nresolution. 
Our code and distilled ImageNet-21K dataset of 20 IPC, 2K recovery\nbudget are available at https://github.com/VILA-Lab/SRe2L/tree/main/CDA.\n","authors":["Zeyuan Yin","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2311.18838v1.pdf","comment":"Code and distilled ImageNet-21K dataset are available at\n https://github.com/VILA-Lab/SRe2L/tree/main/CDA"},{"id":"http://arxiv.org/abs/2311.18837v1","updated":"2023-11-30T18:59:52Z","published":"2023-11-30T18:59:52Z","title":"VIDiff: Translating Videos via Multi-Modal Instructions with Diffusion\n Models","summary":" Diffusion models have achieved significant success in image and video\ngeneration. This motivates a growing interest in video editing tasks, where\nvideos are edited according to provided text descriptions. However, most\nexisting approaches only focus on video editing for short clips and rely on\ntime-consuming tuning or inference. We are the first to propose Video\nInstruction Diffusion (VIDiff), a unified foundation model designed for a wide\nrange of video tasks. These tasks encompass both understanding tasks (such as\nlanguage-guided video object segmentation) and generative tasks (video editing\nand enhancement). Our model can edit and translate the desired results within\nseconds based on user instructions. Moreover, we design an iterative\nauto-regressive method to ensure consistency in editing and enhancing long\nvideos. We provide convincing generative results for diverse input videos and\nwritten instructions, both qualitatively and quantitatively. More examples can\nbe found at our website https://ChenHsing.github.io/VIDiff.\n","authors":["Zhen Xing","Qi Dai","Zihao Zhang","Hui Zhang","Han Hu","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.18837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18827v1","updated":"2023-11-30T18:59:06Z","published":"2023-11-30T18:59:06Z","title":"Motion-Conditioned Image Animation for Video Editing","summary":" We introduce MoCA, a Motion-Conditioned Image Animation approach for video\nediting. It leverages a simple decomposition of the video editing problem into\nimage editing followed by motion-conditioned image animation. Furthermore,\ngiven the lack of robust evaluation datasets for video editing, we introduce a\nnew benchmark that measures edit capability across a wide variety of tasks,\nsuch as object replacement, background changes, style changes, and motion\nedits. We present a comprehensive human evaluation of the latest video editing\nmethods along with MoCA, on our proposed benchmark. MoCA establishes a new\nstate-of-the-art, demonstrating greater human preference win-rate, and\noutperforming notable recent approaches including Dreamix (63%), MasaCtrl\n(75%), and Tune-A-Video (72%), with especially significant improvements for\nmotion edits.\n","authors":["Wilson Yan","Andrew Brown","Pieter Abbeel","Rohit Girdhar","Samaneh Azadi"],"pdf_url":"https://arxiv.org/pdf/2311.18827v1.pdf","comment":"Project page: https://facebookresearch.github.io/MoCA"},{"id":"http://arxiv.org/abs/2311.18826v1","updated":"2023-11-30T18:59:05Z","published":"2023-11-30T18:59:05Z","title":"Geometry-Aware Normalizing Wasserstein Flows for Optimal Causal\n Inference","summary":" This manuscript enriches the framework of continuous normalizing flows (CNFs)\nwithin causal inference, primarily to augment the geometric properties of\nparametric submodels used in targeted maximum likelihood estimation (TMLE). 
By\nintroducing an innovative application of CNFs, we construct a refined series of\nparametric submodels that enable a directed interpolation between the prior\ndistribution $p_0$ and the empirical distribution $p_1$. This proposed\nmethodology serves to optimize the semiparametric efficiency bound in causal\ninference by orchestrating CNFs to align with Wasserstein gradient flows. Our\napproach not only endeavors to minimize the mean squared error in the\nestimation but also imbues the estimators with geometric sophistication,\nthereby enhancing robustness against misspecification. This robustness is\ncrucial, as it alleviates the dependence on the standard $n^{\\frac{1}{4}}$ rate\nfor a doubly-robust perturbation direction in TMLE. By incorporating robust\noptimization principles and differential geometry into the estimators, the\ndeveloped geometry-aware CNFs represent a significant advancement in the\npursuit of doubly robust causal inference.\n","authors":["Kaiwen Hou"],"pdf_url":"https://arxiv.org/pdf/2311.18826v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17898v2","updated":"2023-11-30T18:59:01Z","published":"2023-11-29T18:51:46Z","title":"Knowledge Pursuit Prompting for Zero-Shot Multimodal Synthesis","summary":" Hallucinations and unfaithful synthesis due to inaccurate prompts with\ninsufficient semantic details are widely observed in multimodal generative\nmodels. A prevalent strategy to align multiple modalities is to fine-tune the\ngenerator with a large number of annotated text-image pairs. However, such a\nprocedure is labor-consuming and resource-draining. The key question we ask is:\ncan we enhance the quality and faithfulness of text-driven generative models\nbeyond extensive text-image pair annotations? To address this question, we\npropose Knowledge Pursuit Prompting (KPP), a zero-shot framework that\niteratively incorporates external knowledge to help generators produce reliable\nvisual content. Instead of training generators to handle generic prompts, KPP\nemploys a recursive knowledge query process to gather informative external\nfacts from the knowledge base, instructs a language model to compress the\nacquired knowledge for prompt refinement, and utilizes text-driven generators\nfor visual synthesis. The entire process is zero-shot, without accessing the\narchitectures and parameters of generative models. We evaluate the framework\nacross multiple text-driven generative tasks (image, 3D rendering, and video)\non datasets of different domains. We further demonstrate the extensibility and\nadaptability of KPP through varying foundation model bases and instructions.\nOur results show that KPP is capable of generating faithful and semantically\nrich content across diverse visual domains, offering a promising solution to\nimprove multimodal generative models.\n","authors":["Jinqi Luo","Kwan Ho Ryan Chan","Dimitris Dimos","René Vidal"],"pdf_url":"https://arxiv.org/pdf/2311.17898v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18824v1","updated":"2023-11-30T18:58:38Z","published":"2023-11-30T18:58:38Z","title":"An Adaptive Framework for Generalizing Network Traffic Prediction\n towards Uncertain Environments","summary":" We have developed a new framework using time-series analysis for dynamically\nassigning mobile network traffic prediction models in previously unseen\nwireless environments. Our framework selectively employs learned behaviors,\noutperforming any single model with over a 50% improvement relative to current\nstudies. 
More importantly, it surpasses traditional approaches without needing\nprior knowledge of a cell. While this paper focuses on network traffic\nprediction using our adaptive forecasting framework, this framework can also be\napplied to other machine learning applications in uncertain environments.\n The framework begins with unsupervised clustering of time-series data to\nidentify unique trends and seasonal patterns. Subsequently, we apply supervised\nlearning for traffic volume prediction within each cluster. This specialization\ntowards specific traffic behaviors occurs without penalties from spatial and\ntemporal variations. Finally, the framework adaptively assigns trained models\nto new, previously unseen cells. By analyzing real-time measurements of a cell,\nour framework intelligently selects the most suitable cluster for that cell at\nany given time, with cluster assignment dynamically adjusting to\nspatio-temporal fluctuations.\n","authors":["Alexander Downey","Evren Tuna","Alkan Soysal"],"pdf_url":"https://arxiv.org/pdf/2311.18824v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18823v1","updated":"2023-11-30T18:58:26Z","published":"2023-11-30T18:58:26Z","title":"Initializing Models with Larger Ones","summary":" Weight initialization plays an important role in neural network training.\nWidely used initialization methods are proposed and evaluated for networks that\nare trained from scratch. However, the growing number of pretrained models now\noffers new opportunities for tackling this classical problem of weight\ninitialization. In this work, we introduce weight selection, a method for\ninitializing smaller models by selecting a subset of weights from a pretrained\nlarger model. This enables the transfer of knowledge from pretrained weights to\nsmaller models. Our experiments demonstrate that weight selection can\nsignificantly enhance the performance of small models and reduce their training\ntime. Notably, it can also be used together with knowledge distillation. Weight\nselection offers a new approach to leverage the power of pretrained models in\nresource-constrained settings, and we hope it can be a useful tool for training\nsmall models in the large-model era. Code is available at\nhttps://github.com/OscarXZQ/weight-selection.\n","authors":["Zhiqiu Xu","Yanjie Chen","Kirill Vishniakov","Yida Yin","Zhiqiang Shen","Trevor Darrell","Lingjie Liu","Zhuang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.18823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18817v1","updated":"2023-11-30T18:55:38Z","published":"2023-11-30T18:55:38Z","title":"Dichotomy of Early and Late Phase Implicit Biases Can Provably Induce\n Grokking","summary":" Recent work by Power et al. (2022) highlighted a surprising \"grokking\"\nphenomenon in learning arithmetic tasks: a neural net first \"memorizes\" the\ntraining set, resulting in perfect training accuracy but near-random test\naccuracy, and after training for sufficiently longer, it suddenly transitions\nto perfect test accuracy. This paper studies the grokking phenomenon in\ntheoretical setups and shows that it can be induced by a dichotomy of early and\nlate phase implicit biases. 
Specifically, when training homogeneous neural nets\nwith large initialization and small weight decay on both classification and\nregression tasks, we prove that the training process gets trapped at a solution\ncorresponding to a kernel predictor for a long time, and then a very sharp\ntransition to min-norm/max-margin predictors occurs, leading to a dramatic\nchange in test accuracy.\n","authors":["Kaifeng Lyu","Jikai Jin","Zhiyuan Li","Simon S. Du","Jason D. Lee","Wei Hu"],"pdf_url":"https://arxiv.org/pdf/2311.18817v1.pdf","comment":"39 pages, 4 figures"},{"id":"http://arxiv.org/abs/2311.18807v1","updated":"2023-11-30T18:52:10Z","published":"2023-11-30T18:52:10Z","title":"Pre-registration for Predictive Modeling","summary":" Amid rising concerns of reproducibility and generalizability in predictive\nmodeling, we explore the possibility and potential benefits of introducing\npre-registration to the field. Despite notable advancements in predictive\nmodeling, spanning core machine learning tasks to various scientific\napplications, challenges such as overlooked contextual factors, data-dependent\ndecision-making, and unintentional re-use of test data have raised questions\nabout the integrity of results. To address these issues, we propose adapting\npre-registration practices from explanatory modeling to predictive modeling. We\ndiscuss current best practices in predictive modeling and their limitations,\nintroduce a lightweight pre-registration template, and present a qualitative\nstudy with machine learning researchers to gain insight into the effectiveness\nof pre-registration in preventing biased estimates and promoting more reliable\nresearch outcomes. We conclude by exploring the scope of problems that\npre-registration can address in predictive modeling and acknowledging its\nlimitations within this context.\n","authors":["Jake M. Hofman","Angelos Chatzimparmpas","Amit Sharma","Duncan J. Watts","Jessica Hullman"],"pdf_url":"https://arxiv.org/pdf/2311.18807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18806v1","updated":"2023-11-30T18:51:50Z","published":"2023-11-30T18:51:50Z","title":"Efficient Baseline for Quantitative Precipitation Forecasting in\n Weather4cast 2023","summary":" Accurate precipitation forecasting is indispensable for informed\ndecision-making across various industries. However, the computational demands\nof current models raise environmental concerns. We address the critical need\nfor accurate precipitation forecasting while considering the environmental\nimpact of computational resources and propose a minimalist U-Net architecture\nto be used as a baseline for future weather forecasting initiatives.\n","authors":["Akshay Punjabi","Pablo Izquierdo Ayala"],"pdf_url":"https://arxiv.org/pdf/2311.18806v1.pdf","comment":"5 pages, 1 figure, Weather4Cast 2023 challenge"},{"id":"http://arxiv.org/abs/2311.18803v1","updated":"2023-11-30T18:49:43Z","published":"2023-11-30T18:49:43Z","title":"BIOCLIP: A Vision Foundation Model for the Tree of Life","summary":" Images of the natural world, collected by a variety of cameras, from drones\nto individual phones, are increasingly abundant sources of biological\ninformation. There is an explosion of computational methods and tools,\nparticularly computer vision, for extracting biologically relevant information\nfrom images for science and conservation. Yet most of these are bespoke\napproaches designed for a specific task and are not easily adaptable or\nextendable to new questions, contexts, and datasets. 
A vision model for general\norganismal biology questions on images is of timely need. To approach this, we\ncurate and release TreeOfLife-10M, the largest and most diverse ML-ready\ndataset of biology images. We then develop BioCLIP, a foundation model for the\ntree of life, leveraging the unique properties of biology captured by\nTreeOfLife-10M, namely the abundance and variety of images of plants, animals,\nand fungi, together with the availability of rich structured biological\nknowledge. We rigorously benchmark our approach on diverse fine-grained biology\nclassification tasks, and find that BioCLIP consistently and substantially\noutperforms existing baselines (by 17% to 20% absolute). Intrinsic evaluation\nreveals that BioCLIP has learned a hierarchical representation conforming to\nthe tree of life, shedding light on its strong generalizability. Our code,\nmodels and data will be made available at\nhttps://github.com/Imageomics/bioclip.\n","authors":["Samuel Stevens","Jiaman Wu","Matthew J Thompson","Elizabeth G Campolongo","Chan Hee Song","David Edward Carlyn","Li Dong","Wasila M Dahdul","Charles Stewart","Tanya Berger-Wolf","Wei-Lun Chao","Yu Su"],"pdf_url":"https://arxiv.org/pdf/2311.18803v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2311.14743v3","updated":"2023-11-30T18:48:24Z","published":"2023-11-21T18:41:26Z","title":"A Baseline Analysis of Reward Models' Ability To Accurately Analyze\n Foundation Models Under Distribution Shift","summary":" Foundation models, specifically Large Language Models (LLM's), have lately\ngained wide-spread attention and adoption. Reinforcement Learning with Human\nFeedback (RLHF) involves training a reward model to capture desired behaviors,\nwhich is then used to align an LLM. These reward models are additionally used\nat inference-time to estimate how well LLM responses adhere to those desired\nbehaviors. However, there is little work measuring how robust these reward\nmodels are to distribution shifts. In this work, we evaluate how reward model\nperformance - measured via accuracy and calibration (i.e. alignment between\naccuracy and confidence) - is affected by distribution shift. We show novel\ncalibration patterns and accuracy drops due to OOD prompts and responses, and\nthat the reward model is more sensitive to shifts in responses than prompts.\nAdditionally, we adapt an OOD detection technique commonly used in\nclassification to the reward model setting in order to detect these\ndistribution shifts in prompts and responses.\n","authors":["Ben Pikus","Will LeVine","Tony Chen","Sean Hendryx"],"pdf_url":"https://arxiv.org/pdf/2311.14743v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18787v1","updated":"2023-11-30T18:37:15Z","published":"2023-11-30T18:37:15Z","title":"Communication-Efficient Federated Optimization over Semi-Decentralized\n Networks","summary":" In large-scale federated and decentralized learning, communication efficiency\nis one of the most challenging bottlenecks. While gossip communication -- where\nagents can exchange information with their connected neighbors -- is more\ncost-effective than communicating with the remote server, it often requires a\ngreater number of communication rounds, especially for large and sparse\nnetworks. To tackle the trade-off, we examine the communication efficiency\nunder a semi-decentralized communication protocol, in which agents can perform\nboth agent-to-agent and agent-to-server communication in a probabilistic\nmanner. 
We design a tailored communication-efficient algorithm over\nsemi-decentralized networks, referred to as PISCO, which inherits the\nrobustness to data heterogeneity thanks to gradient tracking and allows\nmultiple local updates for saving communication. We establish the convergence\nrate of PISCO for nonconvex problems and show that PISCO enjoys a linear\nspeedup in terms of the number of agents and local updates. Our numerical\nresults highlight the superior communication efficiency of PISCO and its\nresilience to data heterogeneity and various network topologies.\n","authors":["He Wang","Yuejie Chi"],"pdf_url":"https://arxiv.org/pdf/2311.18787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18780v1","updated":"2023-11-30T18:24:33Z","published":"2023-11-30T18:24:33Z","title":"MultiResFormer: Transformer with Adaptive Multi-Resolution Modeling for\n General Time Series Forecasting","summary":" Transformer-based models have greatly pushed the boundaries of time series\nforecasting recently. Existing methods typically encode time series data into\n$\\textit{patches}$ using one or a fixed set of patch lengths. This, however,\ncould result in a lack of ability to capture the variety of intricate temporal\ndependencies present in real-world multi-periodic time series. In this paper,\nwe propose MultiResFormer, which dynamically models temporal variations by\nadaptively choosing optimal patch lengths. Concretely, at the beginning of each\nlayer, time series data is encoded into several parallel branches, each using a\ndetected periodicity, before going through the transformer encoder block. We\nconduct extensive evaluations on long- and short-term forecasting datasets\ncomparing MultiResFormer with state-of-the-art baselines. MultiResFormer\noutperforms patch-based Transformer baselines on long-term forecasting tasks\nand also consistently outperforms CNN baselines by a large margin, while using\nmuch fewer parameters than these baselines.\n","authors":["Linfeng Du","Ji Xin","Alex Labach","Saba Zuberi","Maksims Volkovs","Rahul G. Krishnan"],"pdf_url":"https://arxiv.org/pdf/2311.18780v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18775v1","updated":"2023-11-30T18:21:25Z","published":"2023-11-30T18:21:25Z","title":"CoDi-2: In-Context, Interleaved, and Interactive Any-to-Any Generation","summary":" We present CoDi-2, a versatile and interactive Multimodal Large Language\nModel (MLLM) that can follow complex multimodal interleaved instructions,\nconduct in-context learning (ICL), reason, chat, edit, etc., in an any-to-any\ninput-output modality paradigm. By aligning modalities with language for both\nencoding and generation, CoDi-2 empowers Large Language Models (LLMs) to not\nonly understand complex modality-interleaved instructions and in-context\nexamples, but also autoregressively generate grounded and coherent multimodal\noutputs in the continuous feature space. To train CoDi-2, we build a\nlarge-scale generation dataset encompassing in-context multimodal instructions\nacross text, vision, and audio. CoDi-2 demonstrates a wide range of zero-shot\ncapabilities for multimodal generation, such as in-context learning, reasoning,\nand compositionality of any-to-any modality generation through multi-round\ninteractive conversation. CoDi-2 surpasses previous domain-specific models on\ntasks such as subject-driven image generation, vision transformation, and audio\nediting. 
CoDi-2 signifies a substantial breakthrough in developing a\ncomprehensive multimodal foundation model adept at interpreting in-context\nlanguage-vision-audio interleaved instructions and producing multimodal\noutputs.\n","authors":["Zineng Tang","Ziyi Yang","Mahmoud Khademi","Yang Liu","Chenguang Zhu","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2311.18775v1.pdf","comment":"Project Page: https://codi-2.github.io/"},{"id":"http://arxiv.org/abs/2307.14680v2","updated":"2023-11-30T18:18:50Z","published":"2023-07-27T08:10:19Z","title":"TimeGNN: Temporal Dynamic Graph Learning for Time Series Forecasting","summary":" Time series forecasting lies at the core of important real-world applications\nin many fields of science and engineering. The abundance of large time series\ndatasets that consist of complex patterns and long-term dependencies has led to\nthe development of various neural network architectures. Graph neural network\napproaches, which jointly learn a graph structure based on the correlation of\nraw values of multivariate time series while forecasting, have recently seen\ngreat success. However, such solutions are often costly to train and difficult\nto scale. In this paper, we propose TimeGNN, a method that learns dynamic\ntemporal graph representations that can capture the evolution of inter-series\npatterns along with the correlations of multiple series. TimeGNN achieves\ninference times 4 to 80 times faster than other state-of-the-art graph-based\nmethods while achieving comparable forecasting performance\n","authors":["Nancy Xu","Chrysoula Kosma","Michalis Vazirgiannis"],"pdf_url":"https://arxiv.org/pdf/2307.14680v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16111v2","updated":"2023-11-30T18:13:01Z","published":"2023-10-24T18:25:13Z","title":"Locally Differentially Private Document Generation Using Zero Shot\n Prompting","summary":" Numerous studies have highlighted the privacy risks associated with\npretrained large language models. In contrast, our research offers a unique\nperspective by demonstrating that pretrained large language models can\neffectively contribute to privacy preservation. We propose a locally\ndifferentially private mechanism called DP-Prompt, which leverages the power of\npretrained large language models and zero-shot prompting to counter author\nde-anonymization attacks while minimizing the impact on downstream utility.\nWhen DP-Prompt is used with a powerful language model like ChatGPT (gpt-3.5),\nwe observe a notable reduction in the success rate of de-anonymization attacks,\nshowing that it surpasses existing approaches by a considerable margin despite\nits simpler design. For instance, in the case of the IMDB dataset, DP-Prompt\n(with ChatGPT) perfectly recovers the clean sentiment F1 score while achieving\na 46\\% reduction in author identification F1 score against static attackers and\na 26\\% reduction against adaptive attackers. 
We conduct extensive experiments\nacross six open-source large language models, ranging up to 7 billion\nparameters, to analyze various effects of the privacy-utility tradeoff.\n","authors":["Saiteja Utpala","Sara Hooker","Pin Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2310.16111v2.pdf","comment":"Accepted at EMNLP 2023 (Findings)"},{"id":"http://arxiv.org/abs/2311.18769v1","updated":"2023-11-30T18:08:16Z","published":"2023-11-30T18:08:16Z","title":"Online Change Points Detection for Linear Dynamical Systems with Finite\n Sample Guarantees","summary":" The problem of online change point detection is to detect abrupt changes in\nproperties of time series, ideally as soon as possible after those changes\noccur. Existing work on online change point detection either assumes i.i.d\ndata, focuses on asymptotic analysis, does not present theoretical guarantees\non the trade-off between detection accuracy and detection delay, or is only\nsuitable for detecting single change points. In this work, we study the online\nchange point detection problem for linear dynamical systems with unknown\ndynamics, where the data exhibits temporal correlations and the system could\nhave multiple change points. We develop a data-dependent threshold that can be\nused in our test that allows one to achieve a pre-specified upper bound on the\nprobability of making a false alarm. We further provide a finite-sample-based\nbound for the probability of detecting a change point. Our bound demonstrates\nhow parameters used in our algorithm affect the detection probability and\ndelay, and provides guidance on the minimum required time between changes to\nguarantee detection.\n","authors":["Lei Xin","George Chiu","Shreyas Sundaram"],"pdf_url":"https://arxiv.org/pdf/2311.18769v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2103.03808v4","updated":"2023-11-30T18:07:34Z","published":"2021-03-05T17:12:33Z","title":"Two-step reinforcement learning for model-free redesign of nonlinear\n optimal regulator","summary":" In many practical control applications, the performance level of a\nclosed-loop system degrades over time due to the change of plant\ncharacteristics. Thus, there is a strong need for redesigning a controller\nwithout going through the system modeling process, which is often difficult for\nclosed-loop systems. Reinforcement learning (RL) is one of the promising\napproaches that enable model-free redesign of optimal controllers for nonlinear\ndynamical systems based only on the measurement of the closed-loop system.\nHowever, the learning process of RL usually requires a considerable number of\ntrial-and-error experiments using the poorly controlled system that may\naccumulate wear on the plant. To overcome this limitation, we propose a\nmodel-free two-step design approach that improves the transient learning\nperformance of RL in an optimal regulator redesign problem for unknown\nnonlinear systems. Specifically, we first design a linear control law that\nattains some degree of control performance in a model-free manner, and then,\ntrain the nonlinear optimal control law with online RL by using the designed\nlinear control law in parallel. We introduce an offline RL algorithm for the\ndesign of the linear control law and theoretically guarantee its convergence to\nthe LQR controller under mild assumptions. 
Numerical simulations show that the\nproposed approach improves the transient learning performance and efficiency in\nhyperparameter tuning of RL.\n","authors":["Mei Minami","Yuka Masumoto","Yoshihiro Okawa","Tomotake Sasaki","Yutaka Hori"],"pdf_url":"https://arxiv.org/pdf/2103.03808v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18765v1","updated":"2023-11-30T18:05:52Z","published":"2023-11-30T18:05:52Z","title":"MLLMs-Augmented Visual-Language Representation Learning","summary":" Visual-language pre-training (VLP) have achieved remarkable success in\nmulti-modal tasks, largely attributed to the availability of large-scale\nimage-text datasets. In this work, we demonstrate that multi-modal large\nlanguage models (MLLMs) can enhance visual-language representation learning by\nimproving data quality. Our approach is simple, utilizing MLLMs to extend\nmultiple captions for each image. To prevent the bias that introduced by MLLMs'\nhallucinations and intrinsic caption styles, we propose a \"text shearing\" to\nkeep the lengths of extended captions identical to the originals. In image-text\nretrieval, our method consistently obtains 5.6 ~ 35.0% and 16.8 ~ 46.1%\nimprovement on R@1 under the fine-tuning and zero-shot settings, respectively.\nNotably, our zero-shot results are comparable to fine-tuning on target\ndatasets, which encourages more exploration on the versatile use of MLLMs.\n","authors":["Yanqing Liu","Kai Wang","Wenqi Shao","Ping Luo","Yu Qiao","Mike Zheng Shou","Kaipeng Zhang","Yang You"],"pdf_url":"https://arxiv.org/pdf/2311.18765v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18763v1","updated":"2023-11-30T18:04:21Z","published":"2023-11-30T18:04:21Z","title":"Continual Diffusion with STAMINA: STack-And-Mask INcremental Adapters","summary":" Recent work has demonstrated a remarkable ability to customize text-to-image\ndiffusion models to multiple, fine-grained concepts in a sequential (i.e.,\ncontinual) manner while only providing a few example images for each concept.\nThis setting is known as continual diffusion. Here, we ask the question: Can we\nscale these methods to longer concept sequences without forgetting? Although\nprior work mitigates the forgetting of previously learned concepts, we show\nthat its capacity to learn new tasks reaches saturation over longer sequences.\nWe address this challenge by introducing a novel method, STack-And-Mask\nINcremental Adapters (STAMINA), which is composed of low-ranked\nattention-masked adapters and customized MLP tokens. STAMINA is designed to\nenhance the robust fine-tuning properties of LoRA for sequential concept\nlearning via learnable hard-attention masks parameterized with low rank MLPs,\nenabling precise, scalable learning via sparse adaptation. Notably, all\nintroduced trainable parameters can be folded back into the model after\ntraining, inducing no additional inference parameter costs. We show that\nSTAMINA outperforms the prior SOTA for the setting of text-to-image continual\ncustomization on a 50-concept benchmark composed of landmarks and human faces,\nwith no stored replay data. 
Additionally, we extended our method to the setting\nof continual learning for image classification, demonstrating that our gains\nalso translate to state-of-the-art performance in this standard benchmark.\n","authors":["James Seale Smith","Yen-Chang Hsu","Zsolt Kira","Yilin Shen","Hongxia Jin"],"pdf_url":"https://arxiv.org/pdf/2311.18763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18751v1","updated":"2023-11-30T17:50:47Z","published":"2023-11-30T17:50:47Z","title":"Language Model Agents Suffer from Compositional Generalization in Web\n Automation","summary":" Language model agents (LMA) recently emerged as a promising paradigm on\nmuti-step decision making tasks, often outperforming humans and other\nreinforcement learning agents. Despite the promise, their performance on\nreal-world applications that often involve combinations of tasks is still\nunderexplored. In this work, we introduce a new benchmark, called CompWoB -- 50\nnew compositional web automation tasks reflecting more realistic assumptions.\nWe show that while existing prompted LMAs (gpt-3.5-turbo or gpt-4) achieve\n94.0% average success rate on base tasks, their performance degrades to 24.9%\nsuccess rate on compositional tasks. On the other hand, transferred LMAs\n(finetuned only on base tasks) show less generalization gap, dropping from\n85.4% to 54.8%. By balancing data distribution across tasks, we train a new\nmodel, HTML-T5++, that surpasses human-level performance (95.2%) on MiniWoB,\nand achieves the best zero-shot performance on CompWoB (61.5%). While these\nhighlight the promise of small-scale finetuned and transferred models for\ncompositional generalization, their performance further degrades under\ndifferent instruction compositions changing combinational order. In contrast to\nthe recent remarkable success of LMA, our benchmark and detailed analysis\nemphasize the necessity of building LMAs that are robust and generalizable to\ntask compositionality for real-world deployment.\n","authors":["Hiroki Furuta","Yutaka Matsuo","Aleksandra Faust","Izzeddin Gur"],"pdf_url":"https://arxiv.org/pdf/2311.18751v1.pdf","comment":"Code:\n https://github.com/google-research/google-research/tree/master/compositional_rl/compwob"},{"id":"http://arxiv.org/abs/2311.02496v2","updated":"2023-11-30T17:47:04Z","published":"2023-11-04T19:41:50Z","title":"LocoMuJoCo: A Comprehensive Imitation Learning Benchmark for Locomotion","summary":" Imitation Learning (IL) holds great promise for enabling agile locomotion in\nembodied agents. However, many existing locomotion benchmarks primarily focus\non simplified toy tasks, often failing to capture the complexity of real-world\nscenarios and steering research toward unrealistic domains. To advance research\nin IL for locomotion, we present a novel benchmark designed to facilitate\nrigorous evaluation and comparison of IL algorithms. This benchmark encompasses\na diverse set of environments, including quadrupeds, bipeds, and\nmusculoskeletal human models, each accompanied by comprehensive datasets, such\nas real noisy motion capture data, ground truth expert data, and ground truth\nsub-optimal data, enabling evaluation across a spectrum of difficulty levels.\nTo increase the robustness of learned agents, we provide an easy interface for\ndynamics randomization and offer a wide range of partially observable tasks to\ntrain agents across different embodiments. 
Finally, we provide handcrafted\nmetrics for each task and ship our benchmark with state-of-the-art baseline\nalgorithms to ease evaluation and enable fast benchmarking.\n","authors":["Firas Al-Hafez","Guoping Zhao","Jan Peters","Davide Tateo"],"pdf_url":"https://arxiv.org/pdf/2311.02496v2.pdf","comment":"https://github.com/robfiras/loco-mujoco"},{"id":"http://arxiv.org/abs/2311.18749v1","updated":"2023-11-30T17:47:02Z","published":"2023-11-30T17:47:02Z","title":"TransCORALNet: A Two-Stream Transformer CORAL Networks for Supply Chain\n Credit Assessment Cold Start","summary":" This paper proposes an interpretable two-stream transformer CORAL networks\n(TransCORALNet) for supply chain credit assessment under the segment industry\nand cold start problem. The model aims to provide accurate credit assessment\nprediction for new supply chain borrowers with limited historical data. Here,\nthe two-stream domain adaptation architecture with correlation alignment\n(CORAL) loss is used as a core model and is equipped with transformer, which\nprovides insights about the learned features and allow efficient\nparallelization during training. Thanks to the domain adaptation capability of\nthe proposed model, the domain shift between the source and target domain is\nminimized. Therefore, the model exhibits good generalization where the source\nand target do not follow the same distribution, and a limited amount of target\nlabeled instances exist. Furthermore, we employ Local Interpretable\nModel-agnostic Explanations (LIME) to provide more insight into the model\nprediction and identify the key features contributing to supply chain credit\nassessment decisions. The proposed model addresses four significant supply\nchain credit assessment challenges: domain shift, cold start, imbalanced-class\nand interpretability. Experimental results on a real-world data set demonstrate\nthe superiority of TransCORALNet over a number of state-of-the-art baselines in\nterms of accuracy. The code is available on GitHub\nhttps://github.com/JieJieNiu/TransCORALN .\n","authors":["Jie Shi","Arno P. J. M. Siebes","Siamak Mehrkanoon"],"pdf_url":"https://arxiv.org/pdf/2311.18749v1.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.18746v1","updated":"2023-11-30T17:44:22Z","published":"2023-11-30T17:44:22Z","title":"A data-science pipeline to enable the Interpretability of Many-Objective\n Feature Selection","summary":" Many-Objective Feature Selection (MOFS) approaches use four or more\nobjectives to determine the relevance of a subset of features in a supervised\nlearning task. As a consequence, MOFS typically returns a large set of\nnon-dominated solutions, which have to be assessed by the data scientist in\norder to proceed with the final choice. Given the multi-variate nature of the\nassessment, which may include criteria (e.g. fairness) not related to\npredictive accuracy, this step is often not straightforward and suffers from\nthe lack of existing tools. For instance, it is common to make use of a tabular\npresentation of the solutions, which provide little information about the\ntrade-offs and the relations between criteria over the set of solutions.\n This paper proposes an original methodology to support data scientists in the\ninterpretation and comparison of the MOFS outcome by combining post-processing\nand visualisation of the set of solutions. 
The methodology supports the data\nscientist in the selection of an optimal feature subset by providing her with\nhigh-level information at three different levels: objectives, solutions, and\nindividual features.\n The methodology is experimentally assessed on two feature selection tasks\nadopting a GA-based MOFS with six objectives (number of selected features,\nbalanced accuracy, F1-Score, variance inflation factor, statistical parity, and\nequalised odds). The results show the added value of the methodology in the\nselection of the final subset of features.\n","authors":["Uchechukwu F. Njoku","Alberto Abelló","Besim Bilalli","Gianluca Bontempi"],"pdf_url":"https://arxiv.org/pdf/2311.18746v1.pdf","comment":"8 pages, 5 figures, 6 tables"},{"id":"http://arxiv.org/abs/2311.18744v1","updated":"2023-11-30T17:41:46Z","published":"2023-11-30T17:41:46Z","title":"$\\mathbb{Z}_2\\times \\mathbb{Z}_2$ Equivariant Quantum Neural Networks:\n Benchmarking against Classical Neural Networks","summary":" This paper presents a comprehensive comparative analysis of the performance\nof Equivariant Quantum Neural Networks (EQNN) and Quantum Neural Networks\n(QNN), juxtaposed against their classical counterparts: Equivariant Neural\nNetworks (ENN) and Deep Neural Networks (DNN). We evaluate the performance of\neach network with two toy examples for a binary classification task, focusing\non model complexity (measured by the number of parameters) and the size of the\ntraining data set. Our results show that the $\\mathbb{Z}_2\\times \\mathbb{Z}_2$\nEQNN and the QNN provide superior performance for smaller parameter sets and\nmodest training data samples.\n","authors":["Zhongtian Dong","Marçal Comajoan Cara","Gopal Ramesh Dahale","Roy T. Forestano","Sergei Gleyzer","Daniel Justice","Kyoungchul Kong","Tom Magorsch","Konstantin T. Matchev","Katia Matcheva","Eyup B. Unlu"],"pdf_url":"https://arxiv.org/pdf/2311.18744v1.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.18743v1","updated":"2023-11-30T17:41:30Z","published":"2023-11-30T17:41:30Z","title":"AlignBench: Benchmarking Chinese Alignment of Large Language Models","summary":" Alignment has become a critical step for instruction-tuned Large Language\nModels (LLMs) to become helpful assistants. However, effective evaluation of\nalignment for emerging Chinese LLMs is still significantly lacking, calling for\nreal-scenario grounded, open-ended, challenging and automatic evaluations\ntailored for alignment. To fill in this gap, we introduce AlignBench, a\ncomprehensive multi-dimensional benchmark for evaluating LLMs' alignment in\nChinese. Equipped with a human-in-the-loop data curation pipeline, our\nbenchmark employs a rule-calibrated multi-dimensional LLM-as-Judge with\nChain-of-Thought to generate explanations and final ratings as evaluations,\nensuring high reliability and interpretability. Furthermore, we developed a\ndedicated companion evaluator LLM -- CritiqueLLM, which recovers 95\\% of\nGPT-4's evaluation ability and will be provided via public APIs to researchers\nfor evaluation of alignment in Chinese LLMs. 
All evaluation codes, data, and\nLLM generations are available at \\url{https://github.com/THUDM/AlignBench}.\n","authors":["Xiao Liu","Xuanyu Lei","Shengyuan Wang","Yue Huang","Zhuoer Feng","Bosi Wen","Jiale Cheng","Pei Ke","Yifan Xu","Weng Lam Tam","Xiaohan Zhang","Lichao Sun","Hongning Wang","Jing Zhang","Minlie Huang","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2311.18743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18741v1","updated":"2023-11-30T17:38:54Z","published":"2023-11-30T17:38:54Z","title":"VREM-FL: Mobility-Aware Computation-Scheduling Co-Design for Vehicular\n Federated Learning","summary":" Assisted and autonomous driving are rapidly gaining momentum, and will soon\nbecome a reality. Among their key enablers, artificial intelligence and machine\nlearning are expected to play a prominent role, also thanks to the massive\namount of data that smart vehicles will collect from their onboard sensors. In\nthis domain, federated learning is one of the most effective and promising\ntechniques for training global machine learning models, while preserving data\nprivacy at the vehicles and optimizing communications resource usage. In this\nwork, we propose VREM-FL, a computation-scheduling co-design for vehicular\nfederated learning that leverages mobility of vehicles in conjunction with\nestimated 5G radio environment maps. VREM-FL jointly optimizes the global model\nlearned at the server while wisely allocating communication resources. This is\nachieved by orchestrating local computations at the vehicles in conjunction\nwith the transmission of their local model updates in an adaptive and\npredictive fashion, by exploiting radio channel maps. The proposed algorithm\ncan be tuned to trade model training time for radio resource usage.\nExperimental results demonstrate the efficacy of utilizing radio maps. VREM-FL\noutperforms literature benchmarks for both a linear regression model (learning\ntime reduced by 28%) and a deep neural network for a semantic image\nsegmentation task (doubling the number of model updates within the same time\nwindow).\n","authors":["Luca Ballotta","Nicolò Dal Fabbro","Giovanni Perin","Luca Schenato","Michele Rossi","Giuseppe Piro"],"pdf_url":"https://arxiv.org/pdf/2311.18741v1.pdf","comment":"This work has been submitted to IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2311.18736v1","updated":"2023-11-30T17:34:05Z","published":"2023-11-30T17:34:05Z","title":"Controlgym: Large-Scale Safety-Critical Control Environments for\n Benchmarking Reinforcement Learning Algorithms","summary":" We introduce controlgym, a library of thirty-six safety-critical industrial\ncontrol settings, and ten infinite-dimensional partial differential equation\n(PDE)-based control problems. Integrated within the OpenAI Gym/Gymnasium (Gym)\nframework, controlgym allows direct applications of standard reinforcement\nlearning (RL) algorithms like stable-baselines3. Our control environments\ncomplement those in Gym with continuous, unbounded action and observation\nspaces, motivated by real-world control applications. Moreover, the PDE control\nenvironments uniquely allow the users to extend the state dimensionality of the\nsystem to infinity while preserving the intrinsic dynamics. This feature is\ncrucial for evaluating the scalability of RL algorithms for control. 
This\nproject serves the learning for dynamics & control (L4DC) community, aiming to\nexplore key questions: the convergence of RL algorithms in learning control\npolicies; the stability and robustness issues of learning-based controllers;\nand the scalability of RL algorithms to high- and potentially\ninfinite-dimensional systems. We open-source the controlgym project at\nhttps://github.com/xiangyuan-zhang/controlgym.\n","authors":["Xiangyuan Zhang","Weichao Mao","Saviz Mowlavi","Mouhacine Benosman","Tamer Başar"],"pdf_url":"https://arxiv.org/pdf/2311.18736v1.pdf","comment":"25 pages, 16 figures"},{"id":"http://arxiv.org/abs/2311.18735v1","updated":"2023-11-30T17:30:45Z","published":"2023-11-30T17:30:45Z","title":"Dimension Mixer: A Generalized Method for Structured Sparsity in Deep\n Neural Networks","summary":" The recent success of multiple neural architectures like CNNs, Transformers,\nand MLP-Mixers motivated us to look for similarities and differences between\nthem. We found that these architectures can be interpreted through the lens of\na general concept of dimension mixing. Research on coupling flows and the\nbutterfly transform shows that partial and hierarchical signal mixing schemes\nare sufficient for efficient and expressive function approximation. In this\nwork, we study group-wise sparse, non-linear, multi-layered and learnable\nmixing schemes of inputs and find that they are complementary to many standard\nneural architectures. Following our observations and drawing inspiration from\nthe Fast Fourier Transform, we generalize Butterfly Structure to use non-linear\nmixer function allowing for MLP as mixing function called Butterfly MLP. We\nwere also able to mix along sequence dimension for Transformer-based\narchitectures called Butterfly Attention. Experiments on CIFAR and LRA datasets\ndemonstrate that the proposed Non-Linear Butterfly Mixers are efficient and\nscale well when the host architectures are used as mixing function.\nAdditionally, we propose Patch-Only MLP-Mixer for processing spatial 2D signals\ndemonstrating a different dimension mixing strategy.\n","authors":["Suman Sapkota","Binod Bhattarai"],"pdf_url":"https://arxiv.org/pdf/2311.18735v1.pdf","comment":"11 pages, 4 figures, 7 tables"},{"id":"http://arxiv.org/abs/2311.18732v1","updated":"2023-11-30T17:27:22Z","published":"2023-11-30T17:27:22Z","title":"Indoor Millimeter Wave Localization using Multiple Self-Supervised Tiny\n Neural Networks","summary":" We consider the localization of a mobile millimeter-wave client in a large\nindoor environment using multilayer perceptron neural networks (NNs). Instead\nof training and deploying a single deep model, we proceed by choosing among\nmultiple tiny NNs trained in a self-supervised manner. The main challenge then\nbecomes to determine and switch to the best NN among the available ones, as an\nincorrect NN will fail to localize the client. In order to upkeep the\nlocalization accuracy, we propose two switching schemes: one based on a Kalman\nfilter, and one based on the statistical distribution of the training data. We\nanalyze the proposed schemes via simulations, showing that our approach\noutperforms both geometric localization schemes and the use of a single NN.\n","authors":["Anish Shastri","Andres Garcia-Saavedra","Paolo Casari"],"pdf_url":"https://arxiv.org/pdf/2311.18732v1.pdf","comment":"5 pages, 7 figures. 
Under Review"},{"id":"http://arxiv.org/abs/2311.18727v1","updated":"2023-11-30T17:23:40Z","published":"2023-11-30T17:23:40Z","title":"Automatic Functional Differentiation in JAX","summary":" We extend JAX with the capability to automatically differentiate higher-order\nfunctions (functionals and operators). By representing functions as a\ngeneralization of arrays, we seamlessly use JAX's existing primitive system to\nimplement higher-order functions. We present a set of primitive operators that\nserve as foundational building blocks for constructing several key types of\nfunctionals. For every introduced primitive operator, we derive and implement\nboth linearization and transposition rules, aligning with JAX's internal\nprotocols for forward and reverse mode automatic differentiation. This\nenhancement allows for functional differentiation in the same syntax\ntraditionally use for functions. The resulting functional gradients are\nthemselves functions ready to be invoked in python. We showcase this tool's\nefficacy and simplicity through applications where functional derivatives are\nindispensable. The source code of this work is released at\nhttps://github.com/sail-sg/autofd .\n","authors":["Min Lin"],"pdf_url":"https://arxiv.org/pdf/2311.18727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18725v1","updated":"2023-11-30T17:23:17Z","published":"2023-11-30T17:23:17Z","title":"AI in Pharma for Personalized Sequential Decision-Making: Methods,\n Applications and Opportunities","summary":" In the pharmaceutical industry, the use of artificial intelligence (AI) has\nseen consistent growth over the past decade. This rise is attributed to major\nadvancements in statistical machine learning methodologies, computational\ncapabilities and the increased availability of large datasets. AI techniques\nare applied throughout different stages of drug development, ranging from drug\ndiscovery to post-marketing benefit-risk assessment. Kolluri et al. provided a\nreview of several case studies that span these stages, featuring key\napplications such as protein structure prediction, success probability\nestimation, subgroup identification, and AI-assisted clinical trial monitoring.\nFrom a regulatory standpoint, there was a notable uptick in submissions\nincorporating AI components in 2021. The most prevalent therapeutic areas\nleveraging AI were oncology (27%), psychiatry (15%), gastroenterology (12%),\nand neurology (11%). The paradigm of personalized or precision medicine has\ngained significant traction in recent research, partly due to advancements in\nAI techniques \\cite{hamburg2010path}. This shift has had a transformative\nimpact on the pharmaceutical industry. Departing from the traditional\n\"one-size-fits-all\" model, personalized medicine incorporates various\nindividual factors, such as environmental conditions, lifestyle choices, and\nhealth histories, to formulate customized treatment plans. 
By utilizing\nsophisticated machine learning algorithms, clinicians and researchers are\nbetter equipped to make informed decisions in areas such as disease prevention,\ndiagnosis, and treatment selection, thereby optimizing health outcomes for each\nindividual.\n","authors":["Yuhan Li","Hongtao Zhang","Keaven Anderson","Songzi Li","Ruoqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.18725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.13316v2","updated":"2023-11-30T17:20:25Z","published":"2022-11-23T21:34:35Z","title":"Understanding Sample Generation Strategies for Learning Heuristic\n Functions in Classical Planning","summary":" We study the problem of learning good heuristic functions for classical\nplanning tasks with neural networks based on samples represented by states with\ntheir cost-to-goal estimates. The heuristic function is learned for a state\nspace and goal condition with the number of samples limited to a fraction of\nthe size of the state space, and must generalize well for all states of the\nstate space with the same goal condition. Our main goal is to better understand\nthe influence of sample generation strategies on the performance of a greedy\nbest-first heuristic search (GBFS) guided by a learned heuristic function. In a\nset of controlled experiments, we find that two main factors determine the\nquality of the learned heuristic: which states are included in the sample set\nand the quality of the cost-to-goal estimates. These two factors are dependent:\nhaving perfect cost-to-goal estimates is insufficient if the samples are not\nwell distributed across the state space. We also study other effects, such as\nadding samples with high-value estimates. Based on our findings, we propose\npractical strategies to improve the quality of learned heuristics: three\nstrategies that aim to generate more representative states and two strategies\nthat improve the cost-to-goal estimates. Our practical strategies almost double\nthe mean coverage of a GBFS algorithm guided by a learned heuristic.\n","authors":["R. V. Bettker","P. P. Minini","A. G. Pereira","M. Ritt"],"pdf_url":"https://arxiv.org/pdf/2211.13316v2.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2311.18718v1","updated":"2023-11-30T17:19:18Z","published":"2023-11-30T17:19:18Z","title":"Steering Deep Feature Learning with Backward Aligned Feature Updates","summary":" Deep learning succeeds by doing hierarchical feature learning, yet tuning\nHyper-Parameters (HP) such as initialization scales, learning rates etc., only\ngive indirect control over this behavior. In this paper, we propose the\nalignment between the feature updates and the backward pass as a key notion to\npredict, measure and control feature learning. On the one hand, we show that\nwhen alignment holds, the magnitude of feature updates after one SGD step is\nrelated to the magnitude of the forward and backward passes by a simple and\ngeneral formula. This leads to techniques to automatically adjust HPs\n(initialization scales and learning rates) at initialization and throughout\ntraining to attain a desired feature learning behavior. On the other hand, we\nshow that, at random initialization, this alignment is determined by the\nspectrum of a certain kernel, and that well-conditioned layer-to-layer\nJacobians (aka dynamical isometry) implies alignment. Finally, we investigate\nReLU MLPs and ResNets in the large width-then-depth limit. 
Combining hints from\nrandom matrix theory and numerical experiments, we show that (i) in MLP with\niid initializations, alignment degenerates with depth, making it impossible to\nstart training, and that (ii) in ResNets, the branch scale\n$1/\\sqrt{\\text{depth}}$ is the only one maintaining non-trivial alignment at\ninfinite depth.\n","authors":["Lénaïc Chizat","Praneeth Netrapalli"],"pdf_url":"https://arxiv.org/pdf/2311.18718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03170v2","updated":"2023-11-30T17:15:34Z","published":"2023-07-06T17:52:10Z","title":"Focused Transformer: Contrastive Training for Context Scaling","summary":" Large language models have an exceptional capability to incorporate new\ninformation in a contextual manner. However, the full potential of such an\napproach is often restrained due to a limitation in the effective context\nlength. One solution to this issue is to endow an attention layer with access\nto an external memory, which comprises of (key, value) pairs. Yet, as the\nnumber of documents increases, the proportion of relevant keys to irrelevant\nones decreases, leading the model to focus more on the irrelevant keys. We\nidentify a significant challenge, dubbed the distraction issue, where keys\nlinked to different semantic values might overlap, making them hard to\ndistinguish. To tackle this problem, we introduce the Focused Transformer\n(FoT), a technique that employs a training process inspired by contrastive\nlearning. This novel approach enhances the structure of the (key, value) space,\nenabling an extension of the context length. Our method allows for fine-tuning\npre-existing, large-scale models to lengthen their effective context. This is\ndemonstrated by our fine-tuning of $3B$ and $7B$ OpenLLaMA checkpoints. The\nresulting models, which we name LongLLaMA, exhibit advancements in tasks\nrequiring a long context. We further illustrate that our LongLLaMA models\nadeptly manage a $256 k$ context length for passkey retrieval.\n","authors":["Szymon Tworkowski","Konrad Staniszewski","Mikołaj Pacek","Yuhuai Wu","Henryk Michalewski","Piotr Miłoś"],"pdf_url":"https://arxiv.org/pdf/2307.03170v2.pdf","comment":"Accepted at 37th Conference on Neural Information Processing Systems\n (NeurIPS 2023). 28 pages, 10 figures, 11 tables"},{"id":"http://arxiv.org/abs/2308.00629v3","updated":"2023-11-30T17:07:05Z","published":"2023-08-01T15:56:24Z","title":"Hessian-Aware Bayesian Optimization for Decision Making Systems","summary":" Many approaches for optimizing decision making systems rely on gradient based\nmethods requiring informative feedback from the environment. However, in the\ncase where such feedback is sparse or uninformative, such approaches may result\nin poor performance. Derivative-free approaches such as Bayesian Optimization\nmitigate the dependency on the quality of gradient feedback, but are known to\nscale poorly in the high-dimension setting of complex decision making systems.\nThis problem is exacerbated if the system requires interactions between several\nactors cooperating to accomplish a shared goal. To address the dimensionality\nchallenge, we propose a compact multi-layered architecture modeling the\ndynamics of actor interactions through the concept of role. 
Additionally, we\nintroduce Hessian-aware Bayesian Optimization to efficiently optimize the\nmulti-layered architecture parameterized by a large number of parameters.\nExperimental results demonstrate that our method (HA-GP-UCB) works effectively\non several benchmarks under resource constraints and malformed feedback\nsettings.\n","authors":["Mohit Rajpal","Lac Gia Tran","Yehong Zhang","Bryan Kian Hsiang Low"],"pdf_url":"https://arxiv.org/pdf/2308.00629v3.pdf","comment":"Revision after ICLR feedback"},{"id":"http://arxiv.org/abs/2311.18710v1","updated":"2023-11-30T17:02:27Z","published":"2023-11-30T17:02:27Z","title":"Meta-Prior: Meta learning for Adaptive Inverse Problem Solvers","summary":" Deep neural networks have become a foundational tool for addressing imaging\ninverse problems. They are typically trained for a specific task, with a\nsupervised loss to learn a mapping from the observations to the image to\nrecover. However, real-world imaging challenges often lack ground truth data,\nrendering traditional supervised approaches ineffective. Moreover, for each new\nimaging task, a new model needs to be trained from scratch, wasting time and\nresources. To overcome these limitations, we introduce a novel approach based\non meta-learning. Our method trains a meta-model on a diverse set of imaging\ntasks that allows the model to be efficiently fine-tuned for specific tasks\nwith few fine-tuning steps. We show that the proposed method extends to the\nunsupervised setting, where no ground truth data is available. In its bilevel\nformulation, the outer level uses a supervised loss, that evaluates how well\nthe fine-tuned model performs, while the inner loss can be either supervised or\nunsupervised, relying only on the measurement operator. This allows the\nmeta-model to leverage a few ground truth samples for each task while being\nable to generalize to new imaging tasks. We show that in simple settings, this\napproach recovers the Bayes optimal estimator, illustrating the soundness of\nour approach. We also demonstrate our method's effectiveness on various tasks,\nincluding image processing and magnetic resonance imaging.\n","authors":["Matthieu Terris","Thomas Moreau"],"pdf_url":"https://arxiv.org/pdf/2311.18710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18703v1","updated":"2023-11-30T16:53:32Z","published":"2023-11-30T16:53:32Z","title":"Predictable Reinforcement Learning Dynamics through Entropy Rate\n Minimization","summary":" In Reinforcement Learning (RL), agents have no incentive to exhibit\npredictable behaviors, and are often pushed (through e.g. policy entropy\nregularization) to randomize their actions in favor of exploration. From a\nhuman perspective, this makes RL agents hard to interpret and predict, and from\na safety perspective, even harder to formally verify. We propose a novel method\nto induce predictable behavior in RL agents, referred to as\nPredictability-Aware RL (PA-RL), which employs the state sequence entropy rate\nas a predictability measure. We show how the entropy rate can be formulated as\nan average reward objective, and since its entropy reward function is\npolicy-dependent, we introduce an action-dependent surrogate entropy enabling\nthe use of PG methods. We prove that deterministic policies minimizing the\naverage surrogate reward exist and also minimize the actual entropy rate, and\nshow how, given a learned dynamical model, we are able to approximate the value\nfunction associated to the true entropy rate. 
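The Predictability-Aware RL entry above uses the state-sequence entropy rate as its predictability measure. The sketch below estimates that entropy rate for a finite-state trajectory from empirical transition counts; it is a simplified illustration of the quantity being minimized, not the paper's action-dependent surrogate reward or policy-gradient machinery.

# Empirical entropy rate of a finite-state trajectory (nats per step).
import numpy as np

def entropy_rate(states, n_states):
    counts = np.zeros((n_states, n_states))
    for s, s_next in zip(states[:-1], states[1:]):
        counts[s, s_next] += 1
    visits = counts.sum(axis=1)
    stationary = visits / visits.sum()       # empirical state distribution
    rate = 0.0
    for s in range(n_states):
        if visits[s] == 0:
            continue
        p = counts[s] / visits[s]            # empirical transition row
        p = p[p > 0]
        rate += stationary[s] * -(p * np.log(p)).sum()
    return rate

rng = np.random.default_rng(0)
predictable = [i % 4 for i in range(1000)]          # deterministic cycle
random_walk = list(rng.integers(0, 4, size=1000))   # i.i.d. uniform states
print(entropy_rate(predictable, 4))   # ~0.0
print(entropy_rate(random_walk, 4))   # ~log(4) ≈ 1.386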
Finally, we demonstrate the\neffectiveness of the approach in RL tasks inspired by human-robot use-cases,\nand show how it produces agents with more predictable behavior while achieving\nnear-optimal rewards.\n","authors":["Daniel Jarne Ornia","Giannis Delimpaltadakis","Jens Kober","Javier Alonso-Mora"],"pdf_url":"https://arxiv.org/pdf/2311.18703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.01923v2","updated":"2023-11-30T16:50:05Z","published":"2023-03-03T13:48:35Z","title":"Bayesian CART models for insurance claims frequency","summary":" Accuracy and interpretability of a (non-life) insurance pricing model are\nessential qualities to ensure fair and transparent premiums for policy-holders,\nthat reflect their risk. In recent years, the classification and regression\ntrees (CARTs) and their ensembles have gained popularity in the actuarial\nliterature, since they offer good prediction performance and are relatively\neasily interpretable. In this paper, we introduce Bayesian CART models for\ninsurance pricing, with a particular focus on claims frequency modelling.\nAdditionally to the common Poisson and negative binomial (NB) distributions\nused for claims frequency, we implement Bayesian CART for the zero-inflated\nPoisson (ZIP) distribution to address the difficulty arising from the\nimbalanced insurance claims data. To this end, we introduce a general MCMC\nalgorithm using data augmentation methods for posterior tree exploration. We\nalso introduce the deviance information criterion (DIC) for the tree model\nselection. The proposed models are able to identify trees which can better\nclassify the policy-holders into risk groups. Some simulations and real\ninsurance data will be discussed to illustrate the applicability of these\nmodels.\n","authors":["Yaojun Zhang","Lanpeng Ji","Georgios Aivaliotis","Charles Taylor"],"pdf_url":"https://arxiv.org/pdf/2303.01923v2.pdf","comment":"46 pages"},{"id":"http://arxiv.org/abs/2311.18695v1","updated":"2023-11-30T16:42:24Z","published":"2023-11-30T16:42:24Z","title":"Seg2Reg: Differentiable 2D Segmentation to 1D Regression Rendering for\n 360 Room Layout Reconstruction","summary":" State-of-the-art single-view 360-degree room layout reconstruction methods\nformulate the problem as a high-level 1D (per-column) regression task. On the\nother hand, traditional low-level 2D layout segmentation is simpler to learn\nand can represent occluded regions, but it requires complex post-processing for\nthe targeting layout polygon and sacrifices accuracy. We present Seg2Reg to\nrender 1D layout depth regression from the 2D segmentation map in a\ndifferentiable and occlusion-aware way, marrying the merits of both sides.\nSpecifically, our model predicts floor-plan density for the input\nequirectangular 360-degree image. Formulating the 2D layout representation as a\ndensity field enables us to employ `flattened' volume rendering to form 1D\nlayout depth regression. In addition, we propose a novel 3D warping\naugmentation on layout to improve generalization. Finally, we re-implement\nrecent room layout reconstruction methods into our codebase for benchmarking\nand explore modern backbones and training techniques to serve as the strong\nbaseline. Our model significantly outperforms previous arts. 
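The Seg2Reg entry above renders a 1D layout depth from a 2D density field via "flattened" volume rendering. The snippet below shows the standard volume-rendering weights along a single column turned into an expected depth; the sample spacing and density profile are made up, and this is only the rendering step, not the full model.

# 'Flattened' volume rendering along one image column:
# per-sample densities -> expected layout depth.
import numpy as np

def expected_depth(densities, depths, delta):
    alphas = 1.0 - np.exp(-densities * delta)                        # per-sample opacity
    trans = np.cumprod(np.concatenate(([1.0], 1.0 - alphas[:-1])))   # transmittance
    weights = trans * alphas                                         # rendering weights
    return (weights * depths).sum() / (weights.sum() + 1e-8)

depths = np.linspace(0.1, 10.0, 64)      # sample locations along the column
densities = np.zeros(64)
densities[40:] = 5.0                     # a 'wall' starting around depth ~6.3
print(expected_depth(densities, depths, delta=depths[1] - depths[0]))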
The code will be\nmade available upon publication.\n","authors":["Cheng Sun","Wei-En Tai","Yu-Lin Shih","Kuan-Wei Chen","Yong-Jing Syu","Kent Selwyn The","Yu-Chiang Frank Wang","Hwann-Tzong Chen"],"pdf_url":"https://arxiv.org/pdf/2311.18695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18694v1","updated":"2023-11-30T16:39:46Z","published":"2023-11-30T16:39:46Z","title":"Balancing Summarization and Change Detection in Graph Streams","summary":" This study addresses the issue of balancing graph summarization and graph\nchange detection. Graph summarization compresses large-scale graphs into a\nsmaller scale. However, the question remains: To what extent should the\noriginal graph be compressed? This problem is solved from the perspective of\ngraph change detection, aiming to detect statistically significant changes\nusing a stream of summary graphs. If the compression rate is extremely high,\nimportant changes can be ignored, whereas if the compression rate is extremely\nlow, false alarms may increase with more memory. This implies that there is a\ntrade-off between compression rate in graph summarization and accuracy in\nchange detection. We propose a novel quantitative methodology to balance this\ntrade-off to simultaneously realize reliable graph summarization and change\ndetection. We introduce a probabilistic structure of hierarchical latent\nvariable model into a graph, thereby designing a parameterized summary graph on\nthe basis of the minimum description length principle. The parameter specifying\nthe summary graph is then optimized so that the accuracy of change detection is\nguaranteed to suppress Type I error probability (probability of raising false\nalarms) to be less than a given confidence level. First, we provide a\ntheoretical framework for connecting graph summarization with change detection.\nThen, we empirically demonstrate its effectiveness on synthetic and real\ndatasets.\n","authors":["Shintaro Fukushima","Kenji Yamanishi"],"pdf_url":"https://arxiv.org/pdf/2311.18694v1.pdf","comment":"6 pages, Accepted to 23rd IEEE International Conference on Data\n Mining (ICDM2023)"},{"id":"http://arxiv.org/abs/2311.18684v1","updated":"2023-11-30T16:31:04Z","published":"2023-11-30T16:31:04Z","title":"Handling Cost and Constraints with Off-Policy Deep Reinforcement\n Learning","summary":" By reusing data throughout training, off-policy deep reinforcement learning\nalgorithms offer improved sample efficiency relative to on-policy approaches.\nFor continuous action spaces, the most popular methods for off-policy learning\ninclude policy improvement steps where a learned state-action ($Q$) value\nfunction is maximized over selected batches of data. These updates are often\npaired with regularization to combat associated overestimation of $Q$ values.\nWith an eye toward safety, we revisit this strategy in environments with\n\"mixed-sign\" reward functions; that is, with reward functions that include\nindependent positive (incentive) and negative (cost) terms. This setting is\ncommon in real-world applications, and may be addressed with or without\nconstraints on the cost terms. We find the combination of function\napproximation and a term that maximizes $Q$ in the policy update to be\nproblematic in such environments, because systematic errors in value estimation\nimpact the contributions from the competing terms asymmetrically. This results\nin overemphasis of either incentives or costs and may severely limit learning.\nWe explore two remedies to this issue. 
First, consistent with prior work, we\nfind that periodic resetting of $Q$ and policy networks can be used to reduce\nvalue estimation error and improve learning in this setting. Second, we\nformulate novel off-policy actor-critic methods for both unconstrained and\nconstrained learning that do not explicitly maximize $Q$ in the policy update.\nWe find that this second approach, when applied to continuous action spaces\nwith mixed-sign rewards, consistently and significantly outperforms\nstate-of-the-art methods augmented by resetting. We further find that our\napproach produces agents that are both competitive with popular methods overall\nand more reliably competent on frequently-studied control problems that do not\nhave mixed-sign rewards.\n","authors":["Jared Markowitz","Jesse Silverberg","Gary Collins"],"pdf_url":"https://arxiv.org/pdf/2311.18684v1.pdf","comment":"22 pages, 16 figures"},{"id":"http://arxiv.org/abs/2302.10886v3","updated":"2023-11-30T16:27:46Z","published":"2023-02-21T18:59:40Z","title":"Some Intriguing Aspects about Lipschitz Continuity of Neural Networks","summary":" Lipschitz continuity is a crucial functional property of any predictive\nmodel, that naturally governs its robustness, generalisation, as well as\nadversarial vulnerability. Contrary to other works that focus on obtaining\ntighter bounds and developing different practical strategies to enforce certain\nLipschitz properties, we aim to thoroughly examine and characterise the\nLipschitz behaviour of Neural Networks. Thus, we carry out an empirical\ninvestigation in a range of different settings (namely, architectures,\ndatasets, label noise, and more) by exhausting the limits of the simplest and\nthe most general lower and upper bounds. As a highlight of this investigation,\nwe showcase a remarkable fidelity of the lower Lipschitz bound, identify a\nstriking Double Descent trend in both upper and lower bounds to the Lipschitz\nand explain the intriguing effects of label noise on function smoothness and\ngeneralisation.\n","authors":["Grigory Khromov","Sidak Pal Singh"],"pdf_url":"https://arxiv.org/pdf/2302.10886v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18672v1","updated":"2023-11-30T16:19:13Z","published":"2023-11-30T16:19:13Z","title":"A Comparison Between Invariant and Equivariant Classical and Quantum\n Graph Neural Networks","summary":" Machine learning algorithms are heavily relied on to understand the vast\namounts of data from high-energy particle collisions at the CERN Large Hadron\nCollider (LHC). The data from such collision events can naturally be\nrepresented with graph structures. Therefore, deep geometric methods, such as\ngraph neural networks (GNNs), have been leveraged for various data analysis\ntasks in high-energy physics. One typical task is jet tagging, where jets are\nviewed as point clouds with distinct features and edge connections between\ntheir constituent particles. The increasing size and complexity of the LHC\nparticle datasets, as well as the computational models used for their analysis,\ngreatly motivate the development of alternative fast and efficient\ncomputational paradigms such as quantum computation. In addition, to enhance\nthe validity and robustness of deep networks, one can leverage the fundamental\nsymmetries present in the data through the use of invariant inputs and\nequivariant layers. 
In this paper, we perform a fair and comprehensive\ncomparison between classical graph neural networks (GNNs) and equivariant graph\nneural networks (EGNNs) and their quantum counterparts: quantum graph neural\nnetworks (QGNNs) and equivariant quantum graph neural networks (EQGNN). The\nfour architectures were benchmarked on a binary classification task to classify\nthe parton-level particle initiating the jet. Based on their AUC scores, the\nquantum networks were shown to outperform the classical networks. However,\nseeing the computational advantage of the quantum networks in practice may have\nto wait for the further development of quantum technology and its associated\nAPIs.\n","authors":["Roy T. Forestano","Marçal Comajoan Cara","Gopal Ramesh Dahale","Zhongtian Dong","Sergei Gleyzer","Daniel Justice","Kyoungchul Kong","Tom Magorsch","Konstantin T. Matchev","Katia Matcheva","Eyup B. Unlu"],"pdf_url":"https://arxiv.org/pdf/2311.18672v1.pdf","comment":"14 pages, 7 figures, 3 appendices"},{"id":"http://arxiv.org/abs/2311.18663v1","updated":"2023-11-30T16:11:12Z","published":"2023-11-30T16:11:12Z","title":"Choosing the parameter of the Fermat distance: navigating geometry and\n noise","summary":" The Fermat distance has been recently established as a useful tool for\nmachine learning tasks when a natural distance is not directly available to the\npractitioner or to improve the results given by Euclidean distances by\nexploding the geometrical and statistical properties of the dataset. This\ndistance depends on a parameter $\\alpha$ that greatly impacts the performance\nof subsequent tasks. Ideally, the value of $\\alpha$ should be large enough to\nnavigate the geometric intricacies inherent to the problem. At the same, it\nshould remain restrained enough to sidestep any deleterious ramifications\nstemming from noise during the process of distance estimation. We study both\ntheoretically and through simulations how to select this parameter.\n","authors":["Frédéric Chazal","Laure Ferraris","Pablo Groisman","Matthieu Jonckheere","Frédéric Pascal","Facundo Sapienza"],"pdf_url":"https://arxiv.org/pdf/2311.18663v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18498v2","updated":"2023-11-30T16:08:54Z","published":"2023-05-29T14:19:40Z","title":"ANPL: Towards Natural Programming with Interactive Decomposition","summary":" Though LLMs are capable of generating plausible programs, it's challenging to\ninteract with the LLMs further to revise the program, especially if the user's\nspecific requirements are different from the initial proposal. In this paper,\nwe introduce ANPL, an interactive programming system that ensures users can\nalways refine the generated code towards their specific programmatic intents\nvia structured decompositions. Borrowing the paradigm of sketching from program\nsynthesis, an ANPL program consists of a set of input-outputs that it must\nsatisfy, a ``sketch'' -- control/data flow expressed in precise code (e.g.\nPython), and ``holes'' -- sub-modules to be implemented by the LLM specified\nwith natural language. The user revises an ANPL program by either modifying the\nsketch, changing the language used to describe the holes, or providing\nadditional input-outputs to a particular hole, turning it into a sub-ANPL\nprogram that can be solved recursively. This workflow allows the users to\noffload programming burdens to the LLM as much as possible while retaining the\nability to pinpoint and resolve bugs locally, without exposing the rest of the\nprogram to the LLM. 
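The Fermat-distance entry above studies how to choose the parameter α. As a reference point, the snippet below computes the sample Fermat distance between two data points as a shortest path over the dataset where an edge between x_i and x_j costs ||x_i − x_j||^α, using a simple O(n²) Dijkstra over the complete graph; this is an illustrative implementation of the definition, not the paper's estimators or parameter-selection rule.

# Sample Fermat distance via shortest paths with edge costs ||x_i - x_j||**alpha.
import numpy as np

def fermat_distance(X, src, dst, alpha=2.0):
    diffs = X[:, None, :] - X[None, :, :]
    costs = np.linalg.norm(diffs, axis=-1) ** alpha
    n = len(X)
    dist = np.full(n, np.inf)
    dist[src] = 0.0
    visited = np.zeros(n, dtype=bool)
    for _ in range(n):
        u = np.argmin(np.where(visited, np.inf, dist))
        if u == dst or np.isinf(dist[u]):
            break
        visited[u] = True
        dist = np.minimum(dist, dist[u] + costs[u])
    return dist[dst]

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 2))
print(fermat_distance(X, 0, 1, alpha=1.0))   # alpha=1 is just the Euclidean distance
print(fermat_distance(X, 0, 1, alpha=3.0))   # larger alpha favors paths through dense regions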
We deploy ANPL on the Abstraction and Reasoning Corpus\n(ARC), a set of unique tasks that are challenging for state-of-the-art AI\nsystems, showing it outperforms baseline programming systems that (a) without\nthe ability to decompose tasks interactively and (b) without the guarantee that\nthe modules can be correctly composed together. Additional evaluations on APPS,\nHumanEval, and real-world programming tasks have validated that the ANPL\nframework is applicable to multiple programming domains. We release the ANPL\nsolutions to the ARC tasks as a dataset, providing insights into how humans\ndecompose novel tasks programmatically. See our code at\nhttps://iprc-dip.github.io/ANPL/.\n","authors":["Di Huang","Ziyuan Nan","Xing Hu","Pengwei Jin","Shaohui Peng","Yuanbo Wen","Rui Zhang","Zidong Du","Qi Guo","Yewen Pu","Yunji Chen"],"pdf_url":"https://arxiv.org/pdf/2305.18498v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03915v3","updated":"2023-11-30T15:50:46Z","published":"2023-10-05T21:44:18Z","title":"Leveraging Low-Rank and Sparse Recurrent Connectivity for Robust\n Closed-Loop Control","summary":" Developing autonomous agents that can interact with changing environments is\nan open challenge in machine learning. Robustness is particularly important in\nthese settings as agents are often fit offline on expert demonstrations but\ndeployed online where they must generalize to the closed feedback loop within\nthe environment. In this work, we explore the application of recurrent neural\nnetworks to tasks of this nature and understand how a parameterization of their\nrecurrent connectivity influences robustness in closed-loop settings.\nSpecifically, we represent the recurrent connectivity as a function of rank and\nsparsity and show both theoretically and empirically that modulating these two\nvariables has desirable effects on network dynamics. The proposed low-rank,\nsparse connectivity induces an interpretable prior on the network that proves\nto be most amenable for a class of models known as closed-form continuous-time\nneural networks (CfCs). We find that CfCs with fewer parameters can outperform\ntheir full-rank, fully-connected counterparts in the online setting under\ndistribution shift. This yields memory-efficient and robust agents while\nopening a new perspective on how we can modulate network dynamics through\nconnectivity.\n","authors":["Neehal Tumma","Mathias Lechner","Noel Loo","Ramin Hasani","Daniela Rus"],"pdf_url":"https://arxiv.org/pdf/2310.03915v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18639v1","updated":"2023-11-30T15:46:22Z","published":"2023-11-30T15:46:22Z","title":"Targeted Reduction of Causal Models","summary":" Why does a phenomenon occur? Addressing this question is central to most\nscientific inquiries based on empirical observations, and often heavily relies\non simulations of scientific models. As models become more intricate,\ndeciphering the causes behind these phenomena in high-dimensional spaces of\ninterconnected variables becomes increasingly challenging. Causal machine\nlearning may assist scientists in the discovery of relevant and interpretable\npatterns of causation in simulations. We introduce Targeted Causal Reduction\n(TCR), a method for turning complex models into a concise set of causal factors\nthat explain a specific target phenomenon. We derive an information theoretic\nobjective to learn TCR from interventional data or simulations and propose\nalgorithms to optimize this objective efficiently. 
TCR's ability to generate\ninterpretable high-level explanations from complex models is demonstrated on\ntoy and mechanical systems, illustrating its potential to assist scientists in\nthe study of complex phenomena in a broad range of disciplines.\n","authors":["Armin Kekić","Bernhard Schölkopf","Michel Besserve"],"pdf_url":"https://arxiv.org/pdf/2311.18639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08018v4","updated":"2023-11-30T15:29:58Z","published":"2023-06-13T14:35:34Z","title":"Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for\n Large Language Models","summary":" Large Language Models (LLMs), with their remarkable task-handling\ncapabilities and innovative outputs, have catalyzed significant advancements\nacross a spectrum of fields. However, their proficiency within specialized\ndomains such as biomolecular studies remains limited. To address this\nchallenge, we introduce Mol-Instructions, a comprehensive instruction dataset\ndesigned for the biomolecular domain. Mol-Instructions encompasses three key\ncomponents: molecule-oriented instructions, protein-oriented instructions, and\nbiomolecular text instructions. Each component aims to improve the\nunderstanding and prediction capabilities of LLMs concerning biomolecular\nfeatures and behaviors. Through extensive instruction tuning experiments on\nLLMs, we demonstrate the effectiveness of Mol-Instructions in enhancing large\nmodels' performance in the intricate realm of biomolecular studies, thus\nfostering progress in the biomolecular research community. Mol-Instructions is\npublicly available for ongoing research and will undergo regular updates to\nenhance its applicability.\n","authors":["Yin Fang","Xiaozhuan Liang","Ningyu Zhang","Kangwei Liu","Rui Huang","Zhuo Chen","Xiaohui Fan","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2306.08018v4.pdf","comment":"Project homepage: https://github.com/zjunlp/Mol-Instructions, add\n more experiments"},{"id":"http://arxiv.org/abs/2308.12114v3","updated":"2023-11-30T15:26:54Z","published":"2023-08-23T13:09:03Z","title":"Less is More -- Towards parsimonious multi-task models using structured\n sparsity","summary":" Model sparsification in deep learning promotes simpler, more interpretable\nmodels with fewer parameters. This not only reduces the model's memory\nfootprint and computational needs but also shortens inference time. This work\nfocuses on creating sparse models optimized for multiple tasks with fewer\nparameters. These parsimonious models also possess the potential to match or\noutperform dense models in terms of performance. In this work, we introduce\nchannel-wise l1/l2 group sparsity in the shared convolutional layers parameters\n(or weights) of the multi-task learning model. This approach facilitates the\nremoval of extraneous groups i.e., channels (due to l1 regularization) and also\nimposes a penalty on the weights, further enhancing the learning efficiency for\nall tasks (due to l2 regularization). We analyzed the results of group sparsity\nin both single-task and multi-task settings on two widely-used Multi-Task\nLearning (MTL) datasets: NYU-v2 and CelebAMask-HQ. On both datasets, which\nconsist of three different computer vision tasks each, multi-task models with\napproximately 70% sparsity outperform their dense equivalents. 
We also\ninvestigate how changing the degree of sparsification influences the model's\nperformance, the overall sparsity percentage, the patterns of sparsity, and the\ninference time.\n","authors":["Richa Upadhyay","Ronald Phlypo","Rajkumar Saini","Marcus Liwicki"],"pdf_url":"https://arxiv.org/pdf/2308.12114v3.pdf","comment":"accepted at First Conference on Parsimony and Learning (CPAL 2024)"},{"id":"http://arxiv.org/abs/2311.15414v2","updated":"2023-11-30T15:26:20Z","published":"2023-11-26T20:35:19Z","title":"KOPPA: Improving Prompt-based Continual Learning with Key-Query\n Orthogonal Projection and Prototype-based One-Versus-All","summary":" Drawing inspiration from prompt tuning techniques applied to Large Language\nModels, recent methods based on pre-trained ViT networks have achieved\nremarkable results in the field of Continual Learning. Specifically, these\napproaches propose to maintain a set of prompts and allocate a subset of them\nto learn each task using a key-query matching strategy. However, they may\nencounter limitations when lacking control over the correlations between old\ntask queries and keys of future tasks, the shift of features in the latent\nspace, and the relative separation of latent vectors learned in independent\ntasks. In this work, we introduce a novel key-query learning strategy based on\northogonal projection, inspired by model-agnostic meta-learning, to enhance\nprompt matching efficiency and address the challenge of shifting features.\nFurthermore, we introduce a One-Versus-All (OVA) prototype-based component that\nenhances the classification head distinction. Experimental results on benchmark\ndatasets demonstrate that our method empowers the model to achieve results\nsurpassing those of current state-of-the-art approaches by a large margin of up\nto 20%.\n","authors":["Quyen Tran","Lam Tran","Khoat Than","Toan Tran","Dinh Phung","Trung Le"],"pdf_url":"https://arxiv.org/pdf/2311.15414v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.00599v2","updated":"2023-11-30T15:16:06Z","published":"2021-06-01T16:07:50Z","title":"ClustML: A Measure of Cluster Pattern Complexity in Scatterplots Learnt\n from Human-labeled Groupings","summary":" Visual quality measures (VQMs) are designed to support analysts by\nautomatically detecting and quantifying patterns in visualizations. We propose\na new VQM for visual grouping patterns in scatterplots, called ClustML, which\nis trained on previously collected human subject judgments. Our model encodes\nscatterplots in the parametric space of a Gaussian Mixture Model and uses a\nclassifier trained on human judgment data to estimate the perceptual complexity\nof grouping patterns. The numbers of initial mixture components and final\ncombined groups. It improves on existing VQMs, first, by better estimating\nhuman judgments on two-Gaussian cluster patterns and, second, by giving higher\naccuracy when ranking general cluster patterns in scatterplots. We use it to\nanalyze kinship data for genome-wide association studies, in which experts rely\non the visual analysis of large sets of scatterplots. We make the benchmark\ndatasets and the new VQM available for practical use and further improvements.\n","authors":["Mostafa M. 
Abbas","Ehsan Ullah","Abdelkader Baggag","Halima Bensmail","Michael Sedlmair","Michaël Aupetit"],"pdf_url":"https://arxiv.org/pdf/2106.00599v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05566v2","updated":"2023-11-30T15:16:00Z","published":"2023-10-09T09:43:08Z","title":"Aggregated f-average Neural Network for Interpretable Ensembling","summary":" Ensemble learning leverages multiple models (i.e., weak learners) on a common\nmachine learning task to enhance prediction performance. Basic ensembling\napproaches average the weak learners outputs, while more sophisticated ones\nstack a machine learning model in between the weak learners outputs and the\nfinal prediction. This work fuses both aforementioned frameworks. We introduce\nan aggregated f-average (AFA) shallow neural network which models and combines\ndifferent types of averages to perform an optimal aggregation of the weak\nlearners predictions. We emphasise its interpretable architecture and simple\ntraining strategy, and illustrate its good performance on the problem of\nfew-shot class incremental learning.\n","authors":["Mathieu Vu","Emilie Chouzenoux","Jean-Christophe Pesquet","Ismail Ben Ayed"],"pdf_url":"https://arxiv.org/pdf/2310.05566v2.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2311.18608v1","updated":"2023-11-30T15:06:10Z","published":"2023-11-30T15:06:10Z","title":"Contrastive Denoising Score for Text-guided Latent Diffusion Image\n Editing","summary":" With the remarkable advent of text-to-image diffusion models, image editing\nmethods have become more diverse and continue to evolve. A promising recent\napproach in this realm is Delta Denoising Score (DDS) - an image editing\ntechnique based on Score Distillation Sampling (SDS) framework that leverages\nthe rich generative prior of text-to-image diffusion models. However, relying\nsolely on the difference between scoring functions is insufficient for\npreserving specific structural elements from the original image, a crucial\naspect of image editing. Inspired by the similarity and importance differences\nbetween DDS and the contrastive learning for unpaired image-to-image\ntranslation (CUT), here we present an embarrassingly simple yet very powerful\nmodification of DDS, called Contrastive Denoising Score (CDS), for latent\ndiffusion models (LDM). Specifically, to enforce structural correspondence\nbetween the input and output while maintaining the controllability of contents,\nwe introduce a straightforward approach to regulate structural consistency\nusing CUT loss within the DDS framework. To calculate this loss, instead of\nemploying auxiliary networks, we utilize the intermediate features of LDM, in\nparticular, those from the self-attention layers, which possesses rich spatial\ninformation. Our approach enables zero-shot image-to-image translation and\nneural radiance field (NeRF) editing, achieving a well-balanced interplay\nbetween maintaining the structural details and transforming content.\nQualitative results and comparisons demonstrates the effectiveness of our\nproposed method. 
Project page with code is available at\nhttps://hyelinnam.github.io/CDS/.\n","authors":["Hyelin Nam","Gihyun Kwon","Geon Yeong Park","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2311.18608v1.pdf","comment":"Project page: https://hyelinnam.github.io/CDS/"},{"id":"http://arxiv.org/abs/2305.16526v2","updated":"2023-11-30T14:57:59Z","published":"2023-05-25T23:07:43Z","title":"Extending Explainable Boosting Machines to Scientific Image Data","summary":" As the deployment of computer vision technology becomes increasingly common\nin science, the need for explanations of the system and its output has become a\nfocus of great concern. Driven by the pressing need for interpretable models in\nscience, we propose the use of Explainable Boosting Machines (EBMs) for\nscientific image data. Inspired by an important application underpinning the\ndevelopment of quantum technologies, we apply EBMs to cold-atom soliton image\ndata tabularized using Gabor Wavelet Transform-based techniques that preserve\nthe spatial structure of the data. In doing so, we demonstrate the use of EBMs\nfor image data for the first time and show that our approach provides\nexplanations that are consistent with human intuition about the data.\n","authors":["Daniel Schug","Sai Yerramreddy","Rich Caruana","Craig Greenberg","Justyna P. Zwolak"],"pdf_url":"https://arxiv.org/pdf/2305.16526v2.pdf","comment":"7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2311.18598v1","updated":"2023-11-30T14:45:51Z","published":"2023-11-30T14:45:51Z","title":"Generalisable Agents for Neural Network Optimisation","summary":" Optimising deep neural networks is a challenging task due to complex training\ndynamics, high computational requirements, and long training times. To address\nthis difficulty, we propose the framework of Generalisable Agents for Neural\nNetwork Optimisation (GANNO) -- a multi-agent reinforcement learning (MARL)\napproach that learns to improve neural network optimisation by dynamically and\nresponsively scheduling hyperparameters during training. GANNO utilises an\nagent per layer that observes localised network dynamics and accordingly takes\nactions to adjust these dynamics at a layerwise level to collectively improve\nglobal performance. In this paper, we use GANNO to control the layerwise\nlearning rate and show that the framework can yield useful and responsive\nschedules that are competitive with handcrafted heuristics. Furthermore, GANNO\nis shown to perform robustly across a wide variety of unseen initial\nconditions, and can successfully generalise to harder problems than it was\ntrained on. Our work presents an overview of the opportunities that this\nparadigm offers for training neural networks, along with key challenges that\nremain to be overcome.\n","authors":["Kale-ab Tessera","Callum Rhys Tilbury","Sasha Abramowitz","Ruan de Kock","Omayma Mahjoub","Benjamin Rosman","Sara Hooker","Arnu Pretorius"],"pdf_url":"https://arxiv.org/pdf/2311.18598v1.pdf","comment":"Accepted at the Workshop on Advanced Neural Network Training (WANT)\n and Optimization for Machine Learning (OPT) at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2309.11526v3","updated":"2023-11-30T14:37:16Z","published":"2023-09-20T06:55:39Z","title":"Likelihood-based Sensor Calibration using Affine Transformation","summary":" An important task in the field of sensor technology is the efficient\nimplementation of adaptation procedures of measurements from one sensor to\nanother sensor of identical design. 
One idea is to use the estimation of an\naffine transformation between different systems, which can be improved by the\nknowledge of experts. This paper presents an improved solution from Glacier\nResearch that was published back in 1973. The results demonstrate the\nadaptability of this solution for various applications, including software\ncalibration of sensors, implementation of expert-based adaptation, and paving\nthe way for future advancements such as distributed learning methods. One idea\nhere is to use the knowledge of experts for estimating an affine transformation\nbetween different systems. We evaluate our research with simulations and also\nwith real measured data of a multi-sensor board with 8 identical sensors. Both\ndata set and evaluation script are provided for download. The results show an\nimprovement for both the simulation and the experiments with real data.\n","authors":["Rüdiger Machhamer","Lejla Begic Fazlic","Eray Guven","David Junk","Gunes Karabulut Kurt","Stefan Naumann","Stephan Didas","Klaus-Uwe Gollmer","Ralph Bergmann","Ingo J. Timm","Guido Dartmann"],"pdf_url":"https://arxiv.org/pdf/2309.11526v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18588v1","updated":"2023-11-30T14:29:18Z","published":"2023-11-30T14:29:18Z","title":"Optimizing ZX-Diagrams with Deep Reinforcement Learning","summary":" ZX-diagrams are a powerful graphical language for the description of quantum\nprocesses with applications in fundamental quantum mechanics, quantum circuit\noptimization, tensor network simulation, and many more. The utility of\nZX-diagrams relies on a set of local transformation rules that can be applied\nto them without changing the underlying quantum process they describe. These\nrules can be exploited to optimize the structure of ZX-diagrams for a range of\napplications. However, finding an optimal sequence of transformation rules is\ngenerally an open problem. In this work, we bring together ZX-diagrams with\nreinforcement learning, a machine learning technique designed to discover an\noptimal sequence of actions in a decision-making problem and show that a\ntrained reinforcement learning agent can significantly outperform other\noptimization techniques like a greedy strategy or simulated annealing. The use\nof graph neural networks to encode the policy of the agent enables\ngeneralization to diagrams much bigger than seen during the training phase.\n","authors":["Maximilian Nägele","Florian Marquardt"],"pdf_url":"https://arxiv.org/pdf/2311.18588v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.18587v1","updated":"2023-11-30T14:28:25Z","published":"2023-11-30T14:28:25Z","title":"Continuous 16-bit Training: Accelerating 32-bit Pre-Trained Neural\n Networks","summary":" In the field of deep learning, the prevalence of models initially trained\nwith 32-bit precision is a testament to its robustness and accuracy. However,\nthe continuous evolution of these models often demands further training, which\ncan be resource-intensive. This study introduces a novel approach where we\ncontinue the training of these pre-existing 32-bit models using 16-bit\nprecision. This technique not only caters to the need for efficiency in\ncomputational resources but also significantly improves the speed of additional\ntraining phases. By adopting 16-bit precision for ongoing training, we are able\nto substantially decrease memory requirements and computational burden, thereby\naccelerating the training process in a resource-limited setting. 
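The 16-bit continuation entry above proposes resuming training of 32-bit-pretrained weights at 16-bit precision. The toy NumPy sketch below shows the idea on a least-squares problem: a float32 "pre-training" phase followed by further gradient steps carried out entirely in float16. It is only an illustration of lower-precision continuation, not the paper's recipe for deep networks.

# Continue 'float32-pretrained' weights with float16 gradient steps (toy example).
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(256, 8)).astype(np.float32)
y = (X @ rng.normal(size=8)).astype(np.float32)

w = np.zeros(8, dtype=np.float32)
for _ in range(200):                      # 'pre-training' phase in float32
    grad = X.T @ (X @ w - y) / len(X)
    w -= 0.05 * grad

X16, y16, w16 = X.astype(np.float16), y.astype(np.float16), w.astype(np.float16)
for _ in range(200):                      # continuation phase in float16
    grad16 = X16.T @ (X16 @ w16 - y16) / np.float16(len(X16))
    w16 -= np.float16(0.05) * grad16

print(np.mean((X @ w - y) ** 2), np.mean((X @ w16.astype(np.float32) - y) ** 2))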
Our\nexperiments show that this method maintains the high standards of accuracy set\nby the original 32-bit training while providing a much-needed boost in training\nspeed. This approach is especially pertinent in today's context, where most\nmodels are initially trained in 32-bit and require periodic updates and\nrefinements. The findings from our research suggest that this strategy of\n16-bit continuation training can be a key solution for sustainable and\nefficient deep learning, offering a practical way to enhance pre-trained models\nrapidly and in a resource-conscious manner.\n","authors":["Juyoung Yun"],"pdf_url":"https://arxiv.org/pdf/2311.18587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18578v1","updated":"2023-11-30T14:17:57Z","published":"2023-11-30T14:17:57Z","title":"Communication-Efficient Heterogeneous Federated Learning with\n Generalized Heavy-Ball Momentum","summary":" Federated Learning (FL) is the state-of-the-art approach for learning from\ndecentralized data in privacy-constrained scenarios. As the current literature\nreports, the main problems associated with FL refer to system and statistical\nchallenges: the former ones demand for efficient learning from edge devices,\nincluding lowering communication bandwidth and frequency, while the latter\nrequire algorithms robust to non-iidness. State-of-art approaches either\nguarantee convergence at increased communication cost or are not sufficiently\nrobust to handle extreme heterogeneous local distributions. In this work we\npropose a novel generalization of the heavy-ball momentum, and present FedHBM\nto effectively address statistical heterogeneity in FL without introducing any\ncommunication overhead. We conduct extensive experimentation on common FL\nvision and NLP datasets, showing that our FedHBM algorithm empirically yields\nbetter model quality and higher convergence speed w.r.t. the state-of-art,\nespecially in pathological non-iid scenarios. While being designed for\ncross-silo settings, we show how FedHBM is applicable in moderate-to-high\ncross-device scenarios, and how good model initializations (e.g. pre-training)\ncan be exploited for prompt acceleration. Extended experimentation on\nlarge-scale real-world federated datasets further corroborates the\neffectiveness of our approach for real-world FL applications.\n","authors":["Riccardo Zaccone","Carlo Masone","Marco Ciccone"],"pdf_url":"https://arxiv.org/pdf/2311.18578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18575v1","updated":"2023-11-30T14:14:31Z","published":"2023-11-30T14:14:31Z","title":"Class Distribution Shifts in Zero-Shot Learning: Learning Robust\n Representations","summary":" Distribution shifts between training and deployment data often affect the\nperformance of machine learning models. In this paper, we explore a setting\nwhere a hidden variable induces a shift in the distribution of classes. These\ndistribution shifts are particularly challenging for zero-shot classifiers, as\nthey rely on representations learned from training classes, but are deployed on\nnew, unseen ones. 
We introduce an algorithm to learn data representations that\nare robust to such class distribution shifts in zero-shot verification tasks.\nWe show that our approach, which combines hierarchical data sampling with\nout-of-distribution generalization techniques, improves generalization to\ndiverse class distributions in both simulations and real-world datasets.\n","authors":["Yuli Slavutsky","Yuval Benjamini"],"pdf_url":"https://arxiv.org/pdf/2311.18575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18574v1","updated":"2023-11-30T14:09:20Z","published":"2023-11-30T14:09:20Z","title":"Multi-scale Iterative Refinement towards Robust and Versatile Molecular\n Docking","summary":" Molecular docking is a key computational tool utilized to predict the binding\nconformations of small molecules to protein targets, which is fundamental in\nthe design of novel drugs. Despite recent advancements in geometric deep\nlearning-based approaches leading to improvements in blind docking efficiency,\nthese methods have encountered notable challenges, such as limited\ngeneralization performance on unseen proteins, the inability to concurrently\naddress the settings of blind docking and site-specific docking, and the\nfrequent occurrence of physical implausibilities such as inter-molecular steric\nclash. In this study, we introduce DeltaDock, a robust and versatile framework\ndesigned for efficient molecular docking to overcome these challenges.\nDeltaDock operates in a two-step process: rapid initial complex structures\nsampling followed by multi-scale iterative refinement of the initial\nstructures. In the initial stage, to sample accurate structures with high\nefficiency, we develop a ligand-dependent binding site prediction model founded\non large protein models and graph neural networks. This model is then paired\nwith GPU-accelerated sampling algorithms. The sampled structures are updated\nusing a multi-scale iterative refinement module that captures both\nprotein-ligand atom-atom interactions and residue-atom interactions in the\nfollowing stage. Distinct from previous geometric deep learning methods that\nare conditioned on the blind docking setting, DeltaDock demonstrates superior\nperformance in both blind docking and site-specific docking settings.\nComprehensive experimental results reveal that DeltaDock consistently surpasses\nbaseline methods in terms of docking accuracy. Furthermore, it displays\nremarkable generalization capabilities and proficiency for predicting\nphysically valid structures, thereby attesting to its robustness and\nreliability in various scenarios.\n","authors":["Jiaxian Yan","Zaixi Zhang","Kai Zhang","Qi Liu"],"pdf_url":"https://arxiv.org/pdf/2311.18574v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2305.15613v3","updated":"2023-11-30T14:04:57Z","published":"2023-05-24T23:04:34Z","title":"Learning Deep O($n$)-Equivariant Hyperspheres","summary":" This paper presents an approach to learning (deep) $n$D features equivariant\nunder orthogonal transformations, utilizing hyperspheres and regular\n$n$-simplexes. Our main contributions are theoretical and tackle major\nchallenges in geometric deep learning such as equivariance and invariance under\ngeometric transformations. 
Namely, we enrich the recently developed theory of\nsteerable 3D spherical neurons -- SO(3)-equivariant filter banks based on\nneurons with spherical decision surfaces -- by extending said neurons to $n$D,\nwhich we call deep equivariant hyperspheres, and enabling their multi-layer\nconstruction. Using synthetic and real-world data in $n$D, we experimentally\nverify our theoretical contributions and find that our approach is superior to\nthe competing methods for benchmark datasets in all but one case, additionally\ndemonstrating a better speed/performance trade-off in all but one other case.\n","authors":["Pavlo Melnyk","Michael Felsberg","Mårten Wadenbäck","Andreas Robinson","Cuong Le"],"pdf_url":"https://arxiv.org/pdf/2305.15613v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07266v2","updated":"2023-11-30T13:59:53Z","published":"2023-06-12T17:52:39Z","title":"Operator Learning with Neural Fields: Tackling PDEs on General\n Geometries","summary":" Machine learning approaches for solving partial differential equations\nrequire learning mappings between function spaces. While convolutional or graph\nneural networks are constrained to discretized functions, neural operators\npresent a promising milestone toward mapping functions directly. Despite\nimpressive results they still face challenges with respect to the domain\ngeometry and typically rely on some form of discretization. In order to\nalleviate such limitations, we present CORAL, a new method that leverages\ncoordinate-based networks for solving PDEs on general geometries. CORAL is\ndesigned to remove constraints on the input mesh, making it applicable to any\nspatial sampling and geometry. Its ability extends to diverse problem domains,\nincluding PDE solving, spatio-temporal forecasting, and inverse problems like\ngeometric design. CORAL demonstrates robust performance across multiple\nresolutions and performs well in both convex and non-convex domains, surpassing\nor performing on par with state-of-the-art models.\n","authors":["Louis Serrano","Lise Le Boudec","Armand Kassaï Koupaï","Thomas X Wang","Yuan Yin","Jean-Noël Vittaut","Patrick Gallinari"],"pdf_url":"https://arxiv.org/pdf/2306.07266v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18559v1","updated":"2023-11-30T13:50:38Z","published":"2023-11-30T13:50:38Z","title":"FediOS: Decoupling Orthogonal Subspaces for Personalization in\n Feature-skew Federated Learning","summary":" Personalized federated learning (pFL) enables collaborative training among\nmultiple clients to enhance the capability of customized local models. In pFL,\nclients may have heterogeneous (also known as non-IID) data, which poses a key\nchallenge in how to decouple the data knowledge into generic knowledge for\nglobal sharing and personalized knowledge for preserving local personalization.\nA typical way of pFL focuses on label distribution skew, and they adopt a\ndecoupling scheme where the model is split into a common feature extractor and\ntwo prediction heads (generic and personalized). However, such a decoupling\nscheme cannot solve the essential problem of feature skew heterogeneity,\nbecause a common feature extractor cannot decouple the generic and personalized\nfeatures. Therefore, in this paper, we rethink the architecture decoupling\ndesign for feature-skew pFL and propose an effective pFL method called FediOS.\nIn FediOS, we reformulate the decoupling into two feature extractors (generic\nand personalized) and one shared prediction head. 
Orthogonal projections are\nused for clients to map the generic features into one common subspace and\nscatter the personalized features into different subspaces to achieve\ndecoupling for them. In addition, a shared prediction head is trained to\nbalance the importance of generic and personalized features during inference.\nExtensive experiments on four vision datasets demonstrate our method reaches\nstate-of-the-art pFL performances under feature skew heterogeneity.\n","authors":["Lingzhi Gao","Zexi Li","Yang Lu","Chao Wu"],"pdf_url":"https://arxiv.org/pdf/2311.18559v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18558v1","updated":"2023-11-30T13:50:21Z","published":"2023-11-30T13:50:21Z","title":"Learning Radio Environments by Differentiable Ray Tracing","summary":" Ray tracing (RT) is instrumental in 6G research in order to generate\nspatially-consistent and environment-specific channel impulse responses (CIRs).\nWhile acquiring accurate scene geometries is now relatively straightforward,\ndetermining material characteristics requires precise calibration using channel\nmeasurements. We therefore introduce a novel gradient-based calibration method,\ncomplemented by differentiable parametrizations of material properties,\nscattering and antenna patterns. Our method seamlessly integrates with\ndifferentiable ray tracers that enable the computation of derivatives of CIRs\nwith respect to these parameters. Essentially, we approach field computation as\na large computational graph wherein parameters are trainable akin to weights of\na neural network (NN). We have validated our method using both synthetic data\nand real-world indoor channel measurements, employing a distributed\nmultiple-input multiple-output (MIMO) channel sounder.\n","authors":["Jakob Hoydis","Fayçal Aït Aoudia","Sebastian Cammerer","Florian Euchner","Merlin Nimier-David","Stephan ten Brink","Alexander Keller"],"pdf_url":"https://arxiv.org/pdf/2311.18558v1.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2311.18557v1","updated":"2023-11-30T13:48:50Z","published":"2023-11-30T13:48:50Z","title":"Can semi-supervised learning use all the data effectively? A lower bound\n perspective","summary":" Prior works have shown that semi-supervised learning algorithms can leverage\nunlabeled data to improve over the labeled sample complexity of supervised\nlearning (SL) algorithms. However, existing theoretical analyses focus on\nregimes where the unlabeled data is sufficient to learn a good decision\nboundary using unsupervised learning (UL) alone. This begs the question: Can\nSSL algorithms simultaneously improve upon both UL and SL? To this end, we\nderive a tight lower bound for 2-Gaussian mixture models that explicitly\ndepends on the labeled and the unlabeled dataset size as well as the\nsignal-to-noise ratio of the mixture distribution. Surprisingly, our result\nimplies that no SSL algorithm can improve upon the minimax-optimal statistical\nerror rates of SL or UL algorithms for these distributions. Nevertheless, we\nshow empirically on real-world data that SSL algorithms can still outperform UL\nand SL methods. 
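The lower-bound entry above contrasts supervised-only learning with methods that also exploit unlabeled data on 2-Gaussian mixtures. The quick simulation below, using scikit-learn, compares a labeled-only threshold classifier with a mixture model fit on labeled plus unlabeled inputs whose components are oriented by the few labels; it is a rough empirical illustration of the SL/SSL comparison, not the paper's minimax analysis.

# 2-Gaussian-mixture toy comparison: supervised-only vs. using unlabeled data.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
mu = 1.0

def sample(y):
    return rng.normal(loc=(2 * y - 1) * mu, scale=1.0).reshape(-1, 1)

y_lab = np.array([0] * 5 + [1] * 5)               # 10 labeled points
x_lab = sample(y_lab)
x_unlab = sample(rng.integers(0, 2, size=2000))   # 2000 unlabeled points
y_test = rng.integers(0, 2, size=5000)
x_test = sample(y_test)

# SL: threshold halfway between the labeled class means.
thr = 0.5 * (x_lab[y_lab == 0].mean() + x_lab[y_lab == 1].mean())
sl_err = np.mean((x_test[:, 0] > thr).astype(int) != y_test)

# 'SSL': fit a 2-component mixture on all inputs, orient components with the labels.
gmm = GaussianMixture(n_components=2, random_state=0).fit(np.vstack([x_unlab, x_lab]))
pos_comp = int(gmm.predict(x_lab[y_lab == 1]).mean() > 0.5)
ssl_err = np.mean((gmm.predict(x_test) == pos_comp).astype(int) != y_test)

print("SL error:", sl_err, "SSL error:", ssl_err)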
Therefore, our work suggests that, while proving performance\ngains for SSL algorithms is possible, it requires careful tracking of\nconstants.\n","authors":["Alexandru Ţifrea","Gizem Yüce","Amartya Sanyal","Fanny Yang"],"pdf_url":"https://arxiv.org/pdf/2311.18557v1.pdf","comment":"Published in Advances in Neural Information Processing Systems 2023"},{"id":"http://arxiv.org/abs/2311.18553v1","updated":"2023-11-30T13:46:05Z","published":"2023-11-30T13:46:05Z","title":"Heterogeneous Graph-based Trajectory Prediction using Local Map Context\n and Social Interactions","summary":" Precisely predicting the future trajectories of surrounding traffic\nparticipants is a crucial but challenging problem in autonomous driving, due to\ncomplex interactions between traffic agents, map context and traffic rules.\nVector-based approaches have recently shown to achieve among the best\nperformances on trajectory prediction benchmarks. These methods model simple\ninteractions between traffic agents but don't distinguish between relation-type\nand attributes like their distance along the road. Furthermore, they represent\nlanes only by sequences of vectors representing center lines and ignore context\ninformation like lane dividers and other road elements. We present a novel\napproach for vector-based trajectory prediction that addresses these\nshortcomings by leveraging three crucial sources of information: First, we\nmodel interactions between traffic agents by a semantic scene graph, that\naccounts for the nature and important features of their relation. Second, we\nextract agent-centric image-based map features to model the local map context.\nFinally, we generate anchor paths to enforce the policy in multi-modal\nprediction to permitted trajectories only. Each of these three enhancements\nshows advantages over the baseline model HoliGraph.\n","authors":["Daniel Grimm","Maximilian Zipfl","Felix Hertlein","Alexander Naumann","Jürgen Lüttin","Steffen Thoma","Stefan Schmid","Lavdim Halilaj","Achim Rettinger","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2311.18553v1.pdf","comment":"Accepted on IEEE ITSC 2023"},{"id":"http://arxiv.org/abs/2311.12630v2","updated":"2023-11-30T13:44:21Z","published":"2023-11-21T14:24:21Z","title":"Hierarchical Joint Graph Learning and Multivariate Time Series\n Forecasting","summary":" Multivariate time series is prevalent in many scientific and industrial\ndomains. Modeling multivariate signals is challenging due to their long-range\ntemporal dependencies and intricate interactions--both direct and indirect. To\nconfront these complexities, we introduce a method of representing multivariate\nsignals as nodes in a graph with edges indicating interdependency between them.\nSpecifically, we leverage graph neural networks (GNN) and attention mechanisms\nto efficiently learn the underlying relationships within the time series data.\nMoreover, we suggest employing hierarchical signal decompositions running over\nthe graphs to capture multiple spatial dependencies. The effectiveness of our\nproposed model is evaluated across various real-world benchmark datasets\ndesigned for long-term forecasting tasks. 
The results consistently showcase the\nsuperiority of our model, achieving an average 23\\% reduction in mean squared\nerror (MSE) compared to existing models.\n","authors":["Juhyeon Kim","Hyungeun Lee","Seungwon Yu","Ung Hwang","Wooyul Jung","Miseon Park","Kijung Yoon"],"pdf_url":"https://arxiv.org/pdf/2311.12630v2.pdf","comment":"Temporal Graph Learning Workshop @ NeurIPS 2023, New Orleans, United\n States"},{"id":"http://arxiv.org/abs/2311.18547v1","updated":"2023-11-30T13:30:00Z","published":"2023-11-30T13:30:00Z","title":"Real-Time Vibration-Based Bearing Fault Diagnosis Under Time-Varying\n Speed Conditions","summary":" Detection of rolling-element bearing faults is crucial for implementing\nproactive maintenance strategies and for minimizing the economic and\noperational consequences of unexpected failures. However, many existing\ntechniques are developed and tested under strictly controlled conditions,\nlimiting their adaptability to the diverse and dynamic settings encountered in\npractical applications. This paper presents an efficient real-time\nconvolutional neural network (CNN) for diagnosing multiple bearing faults under\nvarious noise levels and time-varying rotational speeds. Additionally, we\npropose a novel Fisher-based spectral separability analysis (SSA) method to\nelucidate the effectiveness of the designed CNN model. We conducted experiments\non both healthy bearings and bearings afflicted with inner race, outer race,\nand roller ball faults. The experimental results show the superiority of our\nmodel over the current state-of-the-art approach in three folds: it achieves\nsubstantial accuracy gains of up to 15.8%, it is robust to noise with high\nperformance across various signal-to-noise ratios, and it runs in real-time\nwith processing durations five times less than acquisition. Additionally, by\nusing the proposed SSA technique, we offer insights into the model's\nperformance and underscore its effectiveness in tackling real-world challenges.\n","authors":["Tuomas Jalonen","Mohammad Al-Sa'd","Serkan Kiranyaz","Moncef Gabbouj"],"pdf_url":"https://arxiv.org/pdf/2311.18547v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18540v1","updated":"2023-11-30T13:22:15Z","published":"2023-11-30T13:22:15Z","title":"Match me if you can: Semantic Correspondence Learning with Unpaired\n Images","summary":" Recent approaches for semantic correspondence have focused on obtaining\nhigh-quality correspondences using a complicated network, refining the\nambiguous or noisy matching points. Despite their performance improvements,\nthey remain constrained by the limited training pairs due to costly point-level\nannotations. This paper proposes a simple yet effective method that performs\ntraining with unlabeled pairs to complement both limited image pairs and sparse\npoint pairs, requiring neither extra labeled keypoints nor trainable modules.\nWe fundamentally extend the data quantity and variety by augmenting new\nunannotated pairs not primitively provided as training pairs in benchmarks.\nUsing a simple teacher-student framework, we offer reliable pseudo\ncorrespondences to the student network via machine supervision. Finally, the\nperformance of our network is steadily improved by the proposed iterative\ntraining, putting back the student as a teacher to generate refined labels and\ntrain a new student repeatedly. 
Our models outperform the milestone baselines,\nincluding state-of-the-art methods on semantic correspondence benchmarks.\n","authors":["Jiwon Kim","Byeongho Heo","Sangdoo Yun","Seungryong Kim","Dongyoon Han"],"pdf_url":"https://arxiv.org/pdf/2311.18540v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2311.18531v1","updated":"2023-11-30T13:15:28Z","published":"2023-11-30T13:15:28Z","title":"Dataset Distillation via the Wasserstein Metric","summary":" Dataset distillation (DD) offers a compelling approach in computer vision,\nwith the goal of condensing extensive datasets into smaller synthetic versions\nwithout sacrificing much of the model performance. In this paper, we continue\nto study the methods for DD, by addressing its conceptually core objective: how\nto capture the essential representation of extensive datasets in smaller,\nsynthetic forms.\n We propose a novel approach utilizing the Wasserstein distance, a metric\nrooted in optimal transport theory, to enhance distribution matching in DD. Our\nmethod leverages the Wasserstein barycenter, offering a geometrically\nmeaningful way to quantify distribution differences and effectively capture the\ncentroid of a set of distributions. Our approach retains the computational\nbenefits of distribution matching-based methods while achieving new\nstate-of-the-art performance on several benchmarks.\n To provide useful prior for learning the images, we embed the synthetic data\ninto the feature space of pretrained classification models to conduct\ndistribution matching. Extensive testing on various high-resolution datasets\nconfirms the effectiveness and adaptability of our method, indicating the\npromising yet unexplored capabilities of Wasserstein metrics in dataset\ndistillation.\n","authors":["Haoyang Liu","Tiancheng Xing","Luwei Li","Vibhu Dalal","Jingrui He","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.18531v1.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2311.18526v1","updated":"2023-11-30T13:05:39Z","published":"2023-11-30T13:05:39Z","title":"HOT: Higher-Order Dynamic Graph Representation Learning with Efficient\n Transformers","summary":" Many graph representation learning (GRL) problems are dynamic, with millions\nof edges added or removed per second. A fundamental workload in this setting is\ndynamic link prediction: using a history of graph updates to predict whether a\ngiven pair of vertices will become connected. Recent schemes for link\nprediction in such dynamic settings employ Transformers, modeling individual\ngraph updates as single tokens. In this work, we propose HOT: a model that\nenhances this line of works by harnessing higher-order (HO) graph structures;\nspecifically, k-hop neighbors and more general subgraphs containing a given\npair of vertices. Harnessing such HO structures by encoding them into the\nattention matrix of the underlying Transformer results in higher accuracy of\nlink prediction outcomes, but at the expense of increased memory pressure. To\nalleviate this, we resort to a recent class of schemes that impose hierarchy on\nthe attention matrix, significantly reducing memory footprint. The final design\noffers a sweetspot between high accuracy and low memory utilization. HOT\noutperforms other dynamic GRL schemes, for example achieving 9%, 7%, and 15%\nhigher accuracy than - respectively - DyGFormer, TGN, and GraphMixer, for the\nMOOC dataset. 
Our design can be seamlessly extended towards other dynamic GRL\nworkloads.\n","authors":["Maciej Besta","Afonso Claudino Catarino","Lukas Gianinazzi","Nils Blach","Piotr Nyczyk","Hubert Niewiadomski","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2311.18526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18525v1","updated":"2023-11-30T13:03:49Z","published":"2023-11-30T13:03:49Z","title":"Detecting Anomalous Network Communication Patterns Using Graph\n Convolutional Networks","summary":" To protect an organizations' endpoints from sophisticated cyberattacks,\nadvanced detection methods are required. In this research, we present\nGCNetOmaly: a graph convolutional network (GCN)-based variational autoencoder\n(VAE) anomaly detector trained on data that include connection events among\ninternal and external machines. As input, the proposed GCN-based VAE model\nreceives two matrices: (i) the normalized adjacency matrix, which represents\nthe connections among the machines, and (ii) the feature matrix, which includes\nvarious features (demographic, statistical, process-related, and Node2vec\nstructural features) that are used to profile the individual nodes/machines.\nAfter training the model on data collected for a predefined time window, the\nmodel is applied on the same data; the reconstruction score obtained by the\nmodel for a given machine then serves as the machine's anomaly score.\nGCNetOmaly was evaluated on real, large-scale data logged by Carbon Black EDR\nfrom a large financial organization's automated teller machines (ATMs) as well\nas communication with Active Directory (AD) servers in two setups: unsupervised\nand supervised. The results of our evaluation demonstrate GCNetOmaly's\neffectiveness in detecting anomalous behavior of machines on unsupervised data.\n","authors":["Yizhak Vaisman","Gilad Katz","Yuval Elovici","Asaf Shabtai"],"pdf_url":"https://arxiv.org/pdf/2311.18525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.01858v2","updated":"2023-11-30T13:01:17Z","published":"2022-11-03T14:50:17Z","title":"Relating graph auto-encoders to linear models","summary":" Graph auto-encoders are widely used to construct graph representations in\nEuclidean vector spaces. However, it has already been pointed out empirically\nthat linear models on many tasks can outperform graph auto-encoders. In our\nwork, we prove that the solution space induced by graph auto-encoders is a\nsubset of the solution space of a linear map. This demonstrates that linear\nembedding models have at least the representational power of graph\nauto-encoders based on graph convolutional networks. So why are we still using\nnonlinear graph auto-encoders? One reason could be that actively restricting\nthe linear solution space might introduce an inductive bias that helps improve\nlearning and generalization. While many researchers believe that the\nnonlinearity of the encoder is the critical ingredient towards this end, we\ninstead identify the node features of the graph as a more powerful inductive\nbias. We give theoretical insights by introducing a corresponding bias in a\nlinear model and analyzing the change in the solution space. 
Our experiments\nare aligned with other empirical work on this question and show that the linear\nencoder can outperform the nonlinear encoder when using feature information.\n","authors":["Solveig Klepper","Ulrike von Luxburg"],"pdf_url":"https://arxiv.org/pdf/2211.01858v2.pdf","comment":"accepted to TMLR"},{"id":"http://arxiv.org/abs/2311.18521v1","updated":"2023-11-30T12:55:51Z","published":"2023-11-30T12:55:51Z","title":"Combining deep generative models with extreme value theory for synthetic\n hazard simulation: a multivariate and spatially coherent approach","summary":" Climate hazards can cause major disasters when they occur simultaneously as\ncompound hazards. To understand the distribution of climate risk and inform\nadaptation policies, scientists need to simulate a large number of physically\nrealistic and spatially coherent events. Current methods are limited by\ncomputational constraints and the probabilistic spatial distribution of\ncompound events is not given sufficient attention. The bottleneck in current\napproaches lies in modelling the dependence structure between variables, as\ninference on parametric models suffers from the curse of dimensionality.\nGenerative adversarial networks (GANs) are well-suited to such a problem due to\ntheir ability to implicitly learn the distribution of data in high-dimensional\nsettings. We employ a GAN to model the dependence structure for daily maximum\nwind speed, significant wave height, and total precipitation over the Bay of\nBengal, combining this with traditional extreme value theory for controlled\nextrapolation of the tails. Once trained, the model can be used to efficiently\ngenerate thousands of realistic compound hazard events, which can inform\nclimate risk assessments for climate adaptation and disaster preparedness. The\nmethod developed is flexible and transferable to other multivariate and spatial\nclimate datasets.\n","authors":["Alison Peard","Jim Hall"],"pdf_url":"https://arxiv.org/pdf/2311.18521v1.pdf","comment":"Accepted at NeurIPS 2023 Workshop: Tackling Climate Change with\n Machine Learning (CCAI)"},{"id":"http://arxiv.org/abs/2305.19007v2","updated":"2023-11-30T12:54:59Z","published":"2023-05-30T13:03:54Z","title":"Training a HyperDimensional Computing Classifier using a Threshold on\n its Confidence","summary":" Hyperdimensional computing (HDC) has become popular for light-weight and\nenergy-efficient machine learning, suitable for wearable Internet-of-Things\n(IoT) devices and near-sensor or on-device processing. HDC is computationally\nless complex than traditional deep learning algorithms and achieves moderate to\ngood classification performance. This article proposes to extend the training\nprocedure in HDC by taking into account not only wrongly classified samples,\nbut also samples that are correctly classified by the HDC model but with low\nconfidence. As such, a confidence threshold is introduced that can be tuned for\neach dataset to achieve the best classification accuracy. The proposed training\nprocedure is tested on UCIHAR, CTG, ISOLET and HAND dataset for which the\nperformance consistently improves compared to the baseline across a range of\nconfidence threshold values. 
The extended training procedure also results in a\nshift towards higher confidence values of the correctly classified samples\nmaking the classifier not only more accurate but also more confident about its\npredictions.\n","authors":["Laura Smets","Werner Van Leekwijck","Ing Jyh Tsang","Steven Latre"],"pdf_url":"https://arxiv.org/pdf/2305.19007v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18520v1","updated":"2023-11-30T12:53:43Z","published":"2023-11-30T12:53:43Z","title":"Calibration-free online test-time adaptation for electroencephalography\n motor imagery decoding","summary":" Providing a promising pathway to link the human brain with external devices,\nBrain-Computer Interfaces (BCIs) have seen notable advancements in decoding\ncapabilities, primarily driven by increasingly sophisticated techniques,\nespecially deep learning. However, achieving high accuracy in real-world\nscenarios remains a challenge due to the distribution shift between sessions\nand subjects. In this paper we will explore the concept of online test-time\nadaptation (OTTA) to continuously adapt the model in an unsupervised fashion\nduring inference time. Our approach guarantees the preservation of privacy by\neliminating the requirement to access the source data during the adaptation\nprocess. Additionally, OTTA achieves calibration-free operation by not\nrequiring any session- or subject-specific data. We will investigate the task\nof electroencephalography (EEG) motor imagery decoding using a lightweight\narchitecture together with different OTTA techniques like alignment, adaptive\nbatch normalization, and entropy minimization. We examine two datasets and\nthree distinct data settings for a comprehensive analysis. Our adaptation\nmethods produce state-of-the-art results, potentially instigating a shift in\ntransfer learning for BCI decoding towards online adaptation.\n","authors":["Martin Wimpff","Mario Döbler","Bin Yang"],"pdf_url":"https://arxiv.org/pdf/2311.18520v1.pdf","comment":"6 pages, 4 figures, submitted to: 12th International Winter\n Conference on Brain-Computer Interface 2024"},{"id":"http://arxiv.org/abs/2309.03720v2","updated":"2023-11-30T12:48:13Z","published":"2023-09-07T13:52:20Z","title":"A Natural Gas Consumption Forecasting System for Continual Learning\n Scenarios based on Hoeffding Trees with Change Point Detection Mechanism","summary":" Forecasting natural gas consumption, considering seasonality and trends, is\ncrucial in planning its supply and consumption and optimizing the cost of\nobtaining it, mainly by industrial entities. However, in times of threats to\nits supply, it is also a critical element that guarantees the supply of this\nraw material to meet individual consumers' needs, ensuring society's energy\nsecurity. This article introduces a novel multistep ahead forecasting of\nnatural gas consumption with change point detection integration for model\ncollection selection with continual learning capabilities using data stream\nprocessing. The performance of the forecasting models based on the proposed\napproach is evaluated in a complex real-world use case of natural gas\nconsumption forecasting. We employed Hoeffding tree predictors as forecasting\nmodels and the Pruned Exact Linear Time (PELT) algorithm for the change point\ndetection procedure. The change point detection integration enables selecting a\ndifferent model collection for successive time frames. 
Thus, three model\ncollection selection procedures (with and without an error feedback loop) are\ndefined and evaluated for forecasting scenarios with various densities of\ndetected change points. These models were compared with change point agnostic\nbaseline approaches. Our experiments show that fewer change points result in a\nlower forecasting error regardless of the model collection selection procedure\nemployed. Also, simpler model collection selection procedures omitting\nforecasting error feedback leads to more robust forecasting models suitable for\ncontinual learning tasks.\n","authors":["Radek Svoboda","Sebastian Basterrech","Jędrzej Kozal","Jan Platoš","Michał Woźniak"],"pdf_url":"https://arxiv.org/pdf/2309.03720v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18512v1","updated":"2023-11-30T12:40:23Z","published":"2023-11-30T12:40:23Z","title":"Revisiting Proposal-based Object Detection","summary":" This paper revisits the pipeline for detecting objects in images with\nproposals. For any object detector, the obtained box proposals or queries need\nto be classified and regressed towards ground truth boxes. The common solution\nfor the final predictions is to directly maximize the overlap between each\nproposal and the ground truth box, followed by a winner-takes-all ranking or\nnon-maximum suppression. In this work, we propose a simple yet effective\nalternative. For proposal regression, we solve a simpler problem where we\nregress to the area of intersection between proposal and ground truth. In this\nway, each proposal only specifies which part contains the object, avoiding a\nblind inpainting problem where proposals need to be regressed beyond their\nvisual scope. In turn, we replace the winner-takes-all strategy and obtain the\nfinal prediction by taking the union over the regressed intersections of a\nproposal group surrounding an object. Our revisited approach comes with minimal\nchanges to the detection pipeline and can be plugged into any existing method.\nWe show that our approach directly improves canonical object detection and\ninstance segmentation architectures, highlighting the utility of\nintersection-based regression and grouping.\n","authors":["Aritra Bhowmik","Martin R. Oswald","Pascal Mettes","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2311.18512v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.18506v1","updated":"2023-11-30T12:30:42Z","published":"2023-11-30T12:30:42Z","title":"Global Convergence of Online Identification for Mixed Linear Regression","summary":" Mixed linear regression (MLR) is a powerful model for characterizing\nnonlinear relationships by utilizing a mixture of linear regression sub-models.\nThe identification of MLR is a fundamental problem, where most of the existing\nresults focus on offline algorithms, rely on independent and identically\ndistributed (i.i.d) data assumptions, and provide local convergence results\nonly. This paper investigates the online identification and data clustering\nproblems for two basic classes of MLRs, by introducing two corresponding new\nonline identification algorithms based on the expectation-maximization (EM)\nprinciple. It is shown that both algorithms will converge globally without\nresorting to the traditional i.i.d data assumptions. 
The main challenge in our\ninvestigation lies in the fact that the gradient of the maximum likelihood\nfunction does not have a unique zero, and a key step in our analysis is to\nestablish the stability of the corresponding differential equation in order to\napply the celebrated Ljung's ODE method. It is also shown that the\nwithin-cluster error and the probability that the new data is categorized into\nthe correct cluster are asymptotically the same as those in the case of known\nparameters. Finally, numerical simulations are provided to verify the\neffectiveness of our online algorithms.\n","authors":["Yujing Liu","Zhixin Liu","Lei Guo"],"pdf_url":"https://arxiv.org/pdf/2311.18506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.03077v4","updated":"2023-11-30T12:23:29Z","published":"2022-09-07T11:33:32Z","title":"On the Convergence of the ELBO to Entropy Sums","summary":" The variational lower bound (a.k.a. ELBO or free energy) is the central\nobjective for many established as well as many novel algorithms for\nunsupervised learning. Learning algorithms change model parameters such that\nthe variational lower bound increases. Learning usually proceeds until\nparameters have converged to values close to a stationary point of the learning\ndynamics. In this purely theoretical contribution, we show that (for a very\nlarge class of generative models) the variational lower bound is at all\nstationary points of learning equal to a sum of entropies. For standard machine\nlearning models with one set of latents and one set observed variables, the sum\nconsists of three entropies: (A) the (average) entropy of the variational\ndistributions, (B) the negative entropy of the model's prior distribution, and\n(C) the (expected) negative entropy of the observable distributions. The\nobtained result applies under realistic conditions including: finite numbers of\ndata points, at any stationary points (including saddle points) and for any\nfamily of (well behaved) variational distributions. The class of generative\nmodels for which we show the equality to entropy sums contains many well-known\ngenerative models. As concrete examples we discuss Sigmoid Belief Networks,\nprobabilistic PCA and (Gaussian and non-Gaussian) mixture models. The results\nalso apply for standard (Gaussian) variational autoencoders, which has been\nshown in parallel (Damm et al., 2023). The prerequisites we use to show\nequality to entropy sums are relatively mild. Concretely, the distributions of\na given generative model have to be of the exponential family (with constant\nbase measure), and the model has to satisfy a parameterization criterion (which\nis usually fulfilled). Proving the equality of the ELBO to entropy sums at\nstationary points (under the stated conditions) is the main contribution of\nthis work.\n","authors":["Jörg Lücke","Jan Warnken"],"pdf_url":"https://arxiv.org/pdf/2209.03077v4.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2311.18498v1","updated":"2023-11-30T12:19:10Z","published":"2023-11-30T12:19:10Z","title":"Data-Agnostic Model Poisoning against Federated Learning: A Graph\n Autoencoder Approach","summary":" This paper proposes a novel, data-agnostic, model poisoning attack on\nFederated Learning (FL), by designing a new adversarial graph autoencoder\n(GAE)-based framework. The attack requires no knowledge of FL training data and\nachieves both effectiveness and undetectability. 
By listening to the benign\nlocal models and the global model, the attacker extracts the graph structural\ncorrelations among the benign local models and the training data features\nsubstantiating the models. The attacker then adversarially regenerates the\ngraph structural correlations while maximizing the FL training loss, and\nsubsequently generates malicious local models using the adversarial graph\nstructure and the training data features of the benign ones. A new algorithm is\ndesigned to iteratively train the malicious local models using GAE and\nsub-gradient descent. The convergence of FL under attack is rigorously proved,\nwith a considerably large optimality gap. Experiments show that the FL accuracy\ndrops gradually under the proposed attack and existing defense mechanisms fail\nto detect it. The attack can give rise to an infection across all benign\ndevices, making it a serious threat to FL.\n","authors":["Kai Li","Jingjing Zheng","Xin Yuan","Wei Ni","Ozgur B. Akan","H. Vincent Poor"],"pdf_url":"https://arxiv.org/pdf/2311.18498v1.pdf","comment":"15 pages, 10 figures, submitted to IEEE Transactions on Information\n Forensics and Security (TIFS)"},{"id":"http://arxiv.org/abs/2311.18495v1","updated":"2023-11-30T12:15:49Z","published":"2023-11-30T12:15:49Z","title":"Improving Adversarial Transferability via Model Alignment","summary":" Neural networks are susceptible to adversarial perturbations that are\ntransferable across different models. In this paper, we introduce a novel model\nalignment technique aimed at improving a given source model's ability in\ngenerating transferable adversarial perturbations. During the alignment\nprocess, the parameters of the source model are fine-tuned to minimize an\nalignment loss. This loss measures the divergence in the predictions between\nthe source model and another, independently trained model, referred to as the\nwitness model. To understand the effect of model alignment, we conduct a\ngeometric anlaysis of the resulting changes in the loss landscape. Extensive\nexperiments on the ImageNet dataset, using a variety of model architectures,\ndemonstrate that perturbations generated from aligned source models exhibit\nsignificantly higher transferability than those from the original source model.\n","authors":["Avery Ma","Amir-massoud Farahmand","Yangchen Pan","Philip Torr","Jindong Gu"],"pdf_url":"https://arxiv.org/pdf/2311.18495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18491v1","updated":"2023-11-30T12:06:15Z","published":"2023-11-30T12:06:15Z","title":"ZeST-NeRF: Using temporal aggregation for Zero-Shot Temporal NeRFs","summary":" In the field of media production, video editing techniques play a pivotal\nrole. Recent approaches have had great success at performing novel view image\nsynthesis of static scenes. But adding temporal information adds an extra layer\nof complexity. Previous models have focused on implicitly representing static\nand dynamic scenes using NeRF. These models achieve impressive results but are\ncostly at training and inference time. They overfit an MLP to describe the\nscene implicitly as a function of position. This paper proposes ZeST-NeRF, a\nnew approach that can produce temporal NeRFs for new scenes without retraining.\nWe can accurately reconstruct novel views using multi-view synthesis techniques\nand scene flow-field estimation, trained only with unrelated scenes. 
We\ndemonstrate how existing state-of-the-art approaches from a range of fields\ncannot adequately solve this new task and demonstrate the efficacy of our\nsolution. The resulting network improves quantitatively by 15% and produces\nsignificantly better visual results.\n","authors":["Violeta Menéndez González","Andrew Gilbert","Graeme Phillipson","Stephen Jolly","Simon Hadfield"],"pdf_url":"https://arxiv.org/pdf/2311.18491v1.pdf","comment":"VUA BMVC 2023"},{"id":"http://arxiv.org/abs/2311.15936v3","updated":"2023-11-30T11:54:38Z","published":"2023-11-27T15:45:02Z","title":"Towards Responsible Governance of Biological Design Tools","summary":" Recent advancements in generative machine learning have enabled rapid\nprogress in biological design tools (BDTs) such as protein structure and\nsequence prediction models. The unprecedented predictive accuracy and novel\ndesign capabilities of BDTs present new and significant dual-use risks. For\nexample, their predictive accuracy allows biological agents, whether vaccines\nor pathogens, to be developed more quickly, while the design capabilities could\nbe used to discover drugs or evade DNA screening techniques. Similar to other\ndual-use AI systems, BDTs present a wicked problem: how can regulators uphold\npublic safety without stifling innovation? We highlight how current regulatory\nproposals that are primarily tailored toward large language models may be less\neffective for BDTs, which require fewer computational resources to train and\nare often developed in an open-source manner. We propose a range of measures to\nmitigate the risk that BDTs are misused, across the areas of responsible\ndevelopment, risk assessment, transparency, access management, cybersecurity,\nand investing in resilience. Implementing such measures will require close\ncoordination between developers and governments.\n","authors":["Richard Moulange","Max Langenkamp","Tessa Alexanian","Samuel Curtis","Morgan Livingston"],"pdf_url":"https://arxiv.org/pdf/2311.15936v3.pdf","comment":"10 pages + references, 1 figure, accepted at NeurIPS 2023 Workshop on\n Regulatable ML as oral presentation"},{"id":"http://arxiv.org/abs/2311.18460v1","updated":"2023-11-30T11:11:26Z","published":"2023-11-30T11:11:26Z","title":"Causal Fairness under Unobserved Confounding: A Neural Sensitivity\n Framework","summary":" Fairness for machine learning predictions is widely required in practice for\nlegal, ethical, and societal reasons. Existing work typically focuses on\nsettings without unobserved confounding, even though unobserved confounding can\nlead to severe violations of causal fairness and, thus, unfair predictions. In\nthis work, we analyze the sensitivity of causal fairness to unobserved\nconfounding. Our contributions are three-fold. First, we derive bounds for\ncausal fairness metrics under different sources of unobserved confounding. This\nenables practitioners to examine the sensitivity of their machine learning\nmodels to unobserved confounding in fairness-critical applications. Second, we\npropose a novel neural framework for learning fair predictions, which allows us\nto offer worst-case guarantees of the extent to which causal fairness can be\nviolated due to unobserved confounding. Third, we demonstrate the effectiveness\nof our framework in a series of experiments, including a real-world case study\nabout predicting prison sentences. To the best of our knowledge, ours is the\nfirst work to study causal fairness under unobserved confounding. 
To this end,\nour work is of direct practical value as a refutation strategy to ensure the\nfairness of predictions in high-stakes applications.\n","authors":["Maresa Schröder","Dennis Frauen","Stefan Feuerriegel"],"pdf_url":"https://arxiv.org/pdf/2311.18460v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09912v3","updated":"2023-11-30T11:03:01Z","published":"2023-10-15T18:44:30Z","title":"Unsupervised Discovery of Interpretable Directions in h-space of\n Pre-trained Diffusion Models","summary":" We propose the first unsupervised and learning-based method to identify\ninterpretable directions in h-space of pre-trained diffusion models. Our method\nis derived from an existing technique that operates on the GAN latent space.\nSpecifically, we employ a shift control module that works on h-space of\npre-trained diffusion models to manipulate a sample into a shifted version of\nitself, followed by a reconstructor to reproduce both the type and the strength\nof the manipulation. By jointly optimizing them, the model will spontaneously\ndiscover disentangled and interpretable directions. To prevent the discovery of\nmeaningless and destructive directions, we employ a discriminator to maintain\nthe fidelity of shifted sample. Due to the iterative generative process of\ndiffusion models, our training requires a substantial amount of GPU VRAM to\nstore numerous intermediate tensors for back-propagating gradient. To address\nthis issue, we propose a general VRAM-efficient training algorithm based on\ngradient checkpointing technique to back-propagate any gradient through the\nwhole generative process, with acceptable occupancy of VRAM and sacrifice of\ntraining efficiency. Compared with existing related works on diffusion models,\nour method inherently identifies global and scalable directions, without\nnecessitating any other complicated procedures. Extensive experiments on\nvarious datasets demonstrate the effectiveness of our method.\n","authors":["Zijian Zhang","Luping Liu","Zhijie Lin","Yichen Zhu","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.09912v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18451v1","updated":"2023-11-30T10:51:46Z","published":"2023-11-30T10:51:46Z","title":"How Much Is Hidden in the NAS Benchmarks? Few-Shot Adaptation of a NAS\n Predictor","summary":" Neural architecture search has proven to be a powerful approach to designing\nand refining neural networks, often boosting their performance and efficiency\nover manually-designed variations, but comes with computational overhead. While\nthere has been a considerable amount of research focused on lowering the cost\nof NAS for mainstream tasks, such as image classification, a lot of those\nimprovements stem from the fact that those tasks are well-studied in the\nbroader context. Consequently, applicability of NAS to emerging and\nunder-represented domains is still associated with a relatively high cost\nand/or uncertainty about the achievable gains. To address this issue, we turn\nour focus towards the recent growth of publicly available NAS benchmarks in an\nattempt to extract general NAS knowledge, transferable across different tasks\nand search spaces. We borrow from the rich field of meta-learning for few-shot\nadaptation and carefully study applicability of those methods to NAS, with a\nspecial focus on the relationship between task-level correlation (domain shift)\nand predictor transferability; which we deem critical for improving NAS on\ndiverse tasks. 
In our experiments, we use 6 NAS benchmarks in conjunction,\nspanning in total 16 NAS settings -- our meta-learning approach not only shows\nsuperior (or matching) performance in the cross-validation experiments but also\nsuccessful extrapolation to a new search space and tasks.\n","authors":["Hrushikesh Loya","Łukasz Dudziak","Abhinav Mehrotra","Royson Lee","Javier Fernandez-Marques","Nicholas D. Lane","Hongkai Wen"],"pdf_url":"https://arxiv.org/pdf/2311.18451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.10696v2","updated":"2023-11-30T10:42:26Z","published":"2022-12-21T00:00:01Z","title":"Analyzing Semantic Faithfulness of Language Models via Input\n Intervention on Question Answering","summary":" Transformer-based language models have been shown to be highly effective for\nseveral NLP tasks. In this paper, we consider three transformer models, BERT,\nRoBERTa, and XLNet, in both small and large versions, and investigate how\nfaithful their representations are with respect to the semantic content of\ntexts. We formalize a notion of semantic faithfulness, in which the semantic\ncontent of a text should causally figure in a model's inferences in question\nanswering. We then test this notion by observing a model's behavior on\nanswering questions about a story after performing two novel semantic\ninterventions: deletion intervention and negation intervention. While\ntransformer models achieve high performance on standard question answering\ntasks, we show that they fail to be semantically faithful once we perform these\ninterventions for a significant number of cases (~50% for deletion\nintervention, and ~20% drop in accuracy for negation intervention). We then\npropose an intervention-based training regime that can mitigate the undesirable\neffects for deletion intervention by a significant margin (from ~ 50% to ~6%).\nWe analyze the inner-workings of the models to better understand the\neffectiveness of intervention-based training for deletion intervention. But we\nshow that this training does not attenuate other aspects of semantic\nunfaithfulness such as the models' inability to deal with negation intervention\nor to capture the predicate-argument structure of texts. We also test\nInstructGPT, via prompting, for its ability to handle the two interventions and\nto capture predicate-argument structure. While InstructGPT models do achieve\nvery high performance on predicate-argument structure task, they fail to\nrespond adequately to our deletion and negation interventions.\n","authors":["Akshay Chaturvedi","Swarnadeep Bhar","Soumadeep Saha","Utpal Garain","Nicholas Asher"],"pdf_url":"https://arxiv.org/pdf/2212.10696v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18437v1","updated":"2023-11-30T10:37:03Z","published":"2023-11-30T10:37:03Z","title":"The Sliding Regret in Stochastic Bandits: Discriminating Index and\n Randomized Policies","summary":" This paper studies the one-shot behavior of no-regret algorithms for\nstochastic bandits. Although many algorithms are known to be asymptotically\noptimal with respect to the expected regret, over a single run, their\npseudo-regret seems to follow one of two tendencies: it is either smooth or\nbumpy. To measure this tendency, we introduce a new notion: the sliding regret,\nthat measures the worst pseudo-regret over a time-window of fixed length\nsliding to infinity. We show that randomized methods (e.g. 
Thompson Sampling\nand MED) have optimal sliding regret, while index policies, although possibly\nasymptotically optimal for the expected regret, have the worst possible sliding\nregret under regularity conditions on their index (e.g. UCB, UCB-V, KL-UCB,\nMOSS, IMED etc.). We further analyze the average bumpiness of the pseudo-regret\nof index policies via the regret of exploration, that we show to be suboptimal\nas well.\n","authors":["Victor Boone"],"pdf_url":"https://arxiv.org/pdf/2311.18437v1.pdf","comment":"31 pages"},{"id":"http://arxiv.org/abs/2309.05950v3","updated":"2023-11-30T10:35:40Z","published":"2023-09-12T04:03:41Z","title":"Language Models as Black-Box Optimizers for Vision-Language Models","summary":" Vision-language models (VLMs) pre-trained on web-scale datasets have\ndemonstrated remarkable capabilities on downstream tasks when fine-tuned with\nminimal data. However, many VLMs rely on proprietary data and are not\nopen-source, which restricts the use of white-box approaches for fine-tuning.\nAs such, we aim to develop a black-box approach to optimize VLMs through\nnatural language prompts, thereby avoiding the need to access model parameters,\nfeature embeddings, or even output logits. We propose employing chat-based LLMs\nto search for the best text prompt for VLMs. Specifically, we adopt an\nautomatic hill-climbing procedure that converges to an effective prompt by\nevaluating the performance of current prompts and asking LLMs to refine them\nbased on textual feedback, all within a conversational process without\nhuman-in-the-loop. In a challenging 1-shot image classification setup, our\nsimple approach surpasses the white-box continuous prompting method (CoOp) by\nan average of 1.5% across 11 datasets including ImageNet. Our approach also\noutperforms both human-engineered and LLM-generated prompts. We highlight the\nadvantage of conversational feedback that incorporates both positive and\nnegative prompts, suggesting that LLMs can utilize the implicit gradient\ndirection in textual feedback for a more efficient search. In addition, we find\nthat the text prompts generated through our strategy are not only more\ninterpretable but also transfer well across different VLM architectures in a\nblack-box manner. Lastly, we demonstrate our framework on a state-of-the-art\nblack-box VLM (DALL-E 3) for text-to-image optimization.\n","authors":["Shihong Liu","Zhiqiu Lin","Samuel Yu","Ryan Lee","Tiffany Ling","Deepak Pathak","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2309.05950v3.pdf","comment":"Project site: llm-can-optimize-vlm.github.io"},{"id":"http://arxiv.org/abs/2311.18434v1","updated":"2023-11-30T10:34:29Z","published":"2023-11-30T10:34:29Z","title":"Exploring the Temperature-Dependent Phase Transition in Modern Hopfield\n Networks","summary":" The recent discovery of a connection between Transformers and Modern Hopfield\nNetworks (MHNs) has reignited the study of neural networks from a physical\nenergy-based perspective. This paper focuses on the pivotal effect of the\ninverse temperature hyperparameter $\\beta$ on the distribution of energy minima\nof the MHN. To achieve this, the distribution of energy minima is tracked in a\nsimplified MHN in which equidistant normalised patterns are stored. This\nnetwork demonstrates a phase transition at a critical temperature\n$\\beta_{\\text{c}}$, from a single global attractor towards highly pattern\nspecific minima as $\\beta$ is increased. 
Importantly, the dynamics are not\nsolely governed by the hyperparameter $\\beta$ but are instead determined by an\neffective inverse temperature $\\beta_{\\text{eff}}$ which also depends on the\ndistribution and size of the stored patterns. Recognizing the role of\nhyperparameters in the MHN could, in the future, aid researchers in the domain\nof Transformers to optimise their initial choices, potentially reducing the\nnecessity for time and energy expensive hyperparameter fine-tuning.\n","authors":["Felix Koulischer","Cédric Goemaere","Tom van der Meersch","Johannes Deleu","Thomas Demeester"],"pdf_url":"https://arxiv.org/pdf/2311.18434v1.pdf","comment":"Accepted as poster for Associative Memory and Hopfield Networks\n workshop at NeurIPS23"},{"id":"http://arxiv.org/abs/2311.18431v1","updated":"2023-11-30T10:29:43Z","published":"2023-11-30T10:29:43Z","title":"On the convergence of adaptive first order methods: proximal gradient\n and alternating minimization algorithms","summary":" Building upon recent works on linesearch-free adaptive proximal gradient\nmethods, this paper proposes AdaPG$^{\\pi,r}$, a framework that unifies and\nextends existing results by providing larger stepsize policies and improved\nlower bounds. Different choices of the parameters $\\pi$ and $r$ are discussed\nand the efficacy of the resulting methods is demonstrated through numerical\nsimulations. In an attempt to better understand the underlying theory, its\nconvergence is established in a more general setting that allows for\ntime-varying parameters. Finally, an adaptive alternating minimization\nalgorithm is presented by exploring the dual setting. This algorithm not only\nincorporates additional adaptivity, but also expands its applicability beyond\nstandard strongly convex settings.\n","authors":["Puya Latafat","Andreas Themelis","Panagiotis Patrinos"],"pdf_url":"https://arxiv.org/pdf/2311.18431v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18426v1","updated":"2023-11-30T10:24:07Z","published":"2023-11-30T10:24:07Z","title":"Convergence Analysis of Fractional Gradient Descent","summary":" Fractional derivatives are a well-studied generalization of integer order\nderivatives. Naturally, for optimization, it is of interest to understand the\nconvergence properties of gradient descent using fractional derivatives.\nConvergence analysis of fractional gradient descent is currently limited both\nin the methods analyzed and the settings analyzed. This paper aims to fill in\nthese gaps by analyzing variations of fractional gradient descent in smooth and\nconvex, smooth and strongly convex, and smooth and non-convex settings. First,\nnovel bounds will be established bridging fractional and integer derivatives.\nThen, these bounds will be applied to the aforementioned settings to prove\n$O(1/T)$ convergence for smooth and convex functions and linear convergence for\nsmooth and strongly convex functions. Additionally, we prove $O(1/T)$\nconvergence for smooth and non-convex functions using an extended notion of\nsmoothness that is more natural for fractional derivatives. 
Finally, empirical\nresults will be presented on the potential speed up of fractional gradient\ndescent over standard gradient descent as well as the challenges of predicting\nwhich will be faster in general.\n","authors":["Ashwani Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2311.18426v1.pdf","comment":"19 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.04051v2","updated":"2023-11-30T10:21:47Z","published":"2023-08-08T04:57:58Z","title":"Generative Models for Anomaly Detection and Design-Space Dimensionality\n Reduction in Shape Optimization","summary":" Our work presents a novel approach to shape optimization, with the twofold\nobjective to improve the efficiency of global optimization algorithms while\npromoting the generation of high-quality designs during the optimization\nprocess free of geometrical anomalies. This is accomplished by reducing the\nnumber of the original design variables defining a new reduced subspace where\nthe geometrical variance is maximized and modeling the underlying generative\nprocess of the data via probabilistic linear latent variable models such as\nfactor analysis and probabilistic principal component analysis. We show that\nthe data follows approximately a Gaussian distribution when the shape\nmodification method is linear and the design variables are sampled uniformly at\nrandom, due to the direct application of the central limit theorem. The degree\nof anomalousness is measured in terms of Mahalanobis distance, and the paper\ndemonstrates that abnormal designs tend to exhibit a high value of this metric.\nThis enables the definition of a new optimization model where anomalous\ngeometries are penalized and consequently avoided during the optimization loop.\nThe procedure is demonstrated for hull shape optimization of the DTMB 5415\nmodel, extensively used as an international benchmark for shape optimization\nproblems. The global optimization routine is carried out using Bayesian\noptimization and the DIRECT algorithm. From the numerical results, the new\nframework improves the convergence of global optimization algorithms, while\nonly designs with high-quality geometrical features are generated through the\noptimization routine thereby avoiding the wastage of precious computationally\nexpensive simulations.\n","authors":["Danny D'Agostino"],"pdf_url":"https://arxiv.org/pdf/2308.04051v2.pdf","comment":"Accepted in Engineering Applications of Artificial Intelligence,\n Elsevier"},{"id":"http://arxiv.org/abs/2311.17431v2","updated":"2023-11-30T10:16:41Z","published":"2023-11-29T08:21:42Z","title":"Grounding Foundation Models through Federated Transfer Learning: A\n General Framework","summary":" Foundation Models (FMs) such as GPT-4 encoded with vast knowledge and\npowerful emergent abilities have achieved remarkable success in various natural\nlanguage processing and computer vision tasks. Grounding FMs by adapting them\nto domain-specific tasks or augmenting them with domain-specific knowledge\nenables us to exploit the full potential of FMs. However, grounding FMs faces\nseveral challenges, stemming primarily from constrained computing resources,\ndata privacy, model heterogeneity, and model ownership. Federated Transfer\nLearning (FTL), the combination of federated learning and transfer learning,\nprovides promising solutions to address these challenges. In recent years, the\nneed for grounding FMs leveraging FTL, coined FTL-FM, has arisen strongly in\nboth academia and industry. 
Motivated by the strong growth in FTL-FM research\nand the potential impact of FTL-FM on industrial applications, we propose an\nFTL-FM framework that formulates problems of grounding FMs in the federated\nlearning setting, construct a detailed taxonomy based on the FTL-FM framework\nto categorize state-of-the-art FTL-FM works, and comprehensively overview\nFTL-FM works based on the proposed taxonomy. We also establish correspondences\nbetween FTL-FM and conventional phases of adapting FM so that FM practitioners\ncan align their research works with FTL-FM. In addition, we overview advanced\nefficiency-improving and privacy-preserving techniques because efficiency and\nprivacy are critical concerns in FTL-FM. Last, we discuss opportunities and\nfuture research directions of FTL-FM.\n","authors":["Yan Kang","Tao Fan","Hanlin Gu","Lixin Fan","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2311.17431v2.pdf","comment":"in progress"},{"id":"http://arxiv.org/abs/2304.10253v2","updated":"2023-11-30T10:14:49Z","published":"2023-04-20T12:21:30Z","title":"Image retrieval outperforms diffusion models on data augmentation","summary":" Many approaches have been proposed to use diffusion models to augment\ntraining datasets for downstream tasks, such as classification. However,\ndiffusion models are themselves trained on large datasets, often with noisy\nannotations, and it remains an open question to which extent these models\ncontribute to downstream classification performance. In particular, it remains\nunclear if they generalize enough to improve over directly using the additional\ndata of their pre-training process for augmentation. We systematically evaluate\na range of existing methods to generate images from diffusion models and study\nnew extensions to assess their benefit for data augmentation. Personalizing\ndiffusion models towards the target data outperforms simpler prompting\nstrategies. However, using the pre-training data of the diffusion model alone,\nvia a simple nearest-neighbor retrieval procedure, leads to even stronger\ndownstream performance. Our study explores the potential of diffusion models in\ngenerating new training data, and surprisingly finds that these sophisticated\nmodels are not yet able to beat a simple and strong image retrieval baseline on\nsimple downstream vision tasks.\n","authors":["Max F. Burg","Florian Wenzel","Dominik Zietlow","Max Horn","Osama Makansi","Francesco Locatello","Chris Russell"],"pdf_url":"https://arxiv.org/pdf/2304.10253v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18398v1","updated":"2023-11-30T09:49:16Z","published":"2023-11-30T09:49:16Z","title":"RainAI -- Precipitation Nowcasting from Satellite Data","summary":" This paper presents a solution to the Weather4Cast 2023 competition, where\nthe goal is to forecast high-resolution precipitation with an 8-hour lead time\nusing lower-resolution satellite radiance images. We propose a simple, yet\neffective method for spatiotemporal feature learning using a 2D U-Net model,\nthat outperforms the official 3D U-Net baseline in both performance and\nefficiency. We place emphasis on refining the dataset, through importance\nsampling and dataset preparation, and show that such techniques have a\nsignificant impact on performance. We further study an alternative\ncross-entropy loss function that improves performance over the standard mean\nsquared error loss, while also enabling models to produce probabilistic\noutputs. 
Additional techniques are explored regarding the generation of\npredictions at different lead times, specifically through Conditioning Lead\nTime. Lastly, to generate high-resolution forecasts, we evaluate standard and\nlearned upsampling methods. The code and trained parameters are available at\nhttps://github.com/rafapablos/w4c23-rainai.\n","authors":["Rafael Pablos Sarabia","Joachim Nyborg","Morten Birk","Ira Assent"],"pdf_url":"https://arxiv.org/pdf/2311.18398v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18393v1","updated":"2023-11-30T09:38:59Z","published":"2023-11-30T09:38:59Z","title":"Data-efficient Deep Reinforcement Learning for Vehicle Trajectory\n Control","summary":" Advanced vehicle control is a fundamental building block in the development\nof autonomous driving systems. Reinforcement learning (RL) promises to achieve\ncontrol performance superior to classical approaches while keeping\ncomputational demands low during deployment. However, standard RL approaches\nlike soft-actor critic (SAC) require extensive amounts of training data to be\ncollected and are thus impractical for real-world application. To address this\nissue, we apply recently developed data-efficient deep RL methods to vehicle\ntrajectory control. Our investigation focuses on three methods, so far\nunexplored for vehicle control: randomized ensemble double Q-learning (REDQ),\nprobabilistic ensembles with trajectory sampling and model predictive path\nintegral optimizer (PETS-MPPI), and model-based policy optimization (MBPO). We\nfind that in the case of trajectory control, the standard model-based RL\nformulation used in approaches like PETS-MPPI and MBPO is not suitable. We,\ntherefore, propose a new formulation that splits dynamics prediction and\nvehicle localization. Our benchmark study on the CARLA simulator reveals that\nthe three identified data-efficient deep RL approaches learn control strategies\non a par with or better than SAC, yet reduce the required number of environment\ninteractions by more than one order of magnitude.\n","authors":["Bernd Frauenknecht","Tobias Ehlgen","Sebastian Trimpe"],"pdf_url":"https://arxiv.org/pdf/2311.18393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18387v1","updated":"2023-11-30T09:30:15Z","published":"2023-11-30T09:30:15Z","title":"On Exact Inversion of DPM-Solvers","summary":" Diffusion probabilistic models (DPMs) are a key component in modern\ngenerative models. DPM-solvers have achieved reduced latency and enhanced\nquality significantly, but have posed challenges to find the exact inverse\n(i.e., finding the initial noise from the given image). Here we investigate the\nexact inversions for DPM-solvers and propose algorithms to perform them when\nsamples are generated by the first-order as well as higher-order DPM-solvers.\nFor each explicit denoising step in DPM-solvers, we formulated the inversions\nusing implicit methods such as gradient descent or forward step method to\nensure the robustness to large classifier-free guidance unlike the prior\napproach using fixed-point iteration. Experimental results demonstrated that\nour proposed exact inversion methods significantly reduced the error of both\nimage and noise reconstructions, greatly enhanced the ability to distinguish\ninvisible watermarks and well prevented unintended background changes\nconsistently during image editing. 
Project page:\n\\url{https://smhongok.github.io/inv-dpm.html}.\n","authors":["Seongmin Hong","Kyeonghyun Lee","Suh Yoon Jeon","Hyewon Bae","Se Young Chun"],"pdf_url":"https://arxiv.org/pdf/2311.18387v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2311.18377v1","updated":"2023-11-30T09:20:24Z","published":"2023-11-30T09:20:24Z","title":"Transfer Learning across Different Chemical Domains: Virtual Screening\n of Organic Materials with Deep Learning Models Pretrained on Small Molecule\n and Chemical Reaction Data","summary":" Machine learning prediction of organic materials properties is an efficient\nvirtual screening method ahead of more expensive screening methods. However,\nthis approach has suffered from insufficient labeled data on organic materials\nto train state-of-the-art machine learning models. In this study, we\ndemonstrate that drug-like small molecule and chemical reaction databases can\nbe used to pretrain the BERT model for the virtual screening of organic\nmaterials. Among the BERT models fine-tuned by five virtual screening tasks on\norganic materials, the USPTO-SMILES pretrained BERT model had R2 > 0.90 for two\ntasks and R2 > 0.82 for one, which was generally superior to the same models\npretrained by the small molecule or organic materials databases, as well as to\nthe other three traditional machine learning models trained directly on the\nvirtual screening task data. The superior performance of the USPTO-SMILES\npretrained BERT model is due to the greater variety of organic building blocks\nin the USPTO database and the broader coverage of the chemical space. The even\nbetter performance of the BERT model pretrained externally from a chemical\nreaction database with additional sources of chemical reactions strengthens our\nproof of concept that transfer learning across different chemical domains is\npractical for the virtual screening of organic materials.\n","authors":["Chengwei Zhang","Yushuang Zhai","Ziyang Gong","Yuan-Bin She","Yun-Fang Yang","An Su"],"pdf_url":"https://arxiv.org/pdf/2311.18377v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16605v2","updated":"2023-11-30T09:19:39Z","published":"2023-11-28T08:45:37Z","title":"LasTGL: An Industrial Framework for Large-Scale Temporal Graph Learning","summary":" Over the past few years, graph neural networks (GNNs) have become powerful\nand practical tools for learning on (static) graph-structure data. However,\nmany real-world applications, such as social networks and e-commerce, involve\ntemporal graphs where nodes and edges are dynamically evolving. Temporal graph\nneural networks (TGNNs) have progressively emerged as an extension of GNNs to\naddress time-evolving graphs and have gradually become a trending research\ntopic in both academics and industry. Advancing research and application in\nsuch an emerging field necessitates the development of new tools to compose\nTGNN models and unify their different schemes for dealing with temporal graphs.\nIn this work, we introduce LasTGL, an industrial framework that integrates\nunified and extensible implementations of common temporal graph learning\nalgorithms for various advanced tasks. The purpose of LasTGL is to provide the\nessential building blocks for solving temporal graph learning tasks, focusing\non the guiding principles of user-friendliness and quick prototyping on which\nPyTorch is based. 
In particular, LasTGL provides comprehensive temporal graph\ndatasets, TGNN models and utilities along with well-documented tutorials,\nmaking it suitable for both absolute beginners and expert deep learning\npractitioners alike.\n","authors":["Jintang Li","Jiawang Dan","Ruofan Wu","Jing Zhou","Sheng Tian","Yunfei Liu","Baokun Wang","Changhua Meng","Weiqiang Wang","Yuchang Zhu","Liang Chen","Zibin Zheng"],"pdf_url":"https://arxiv.org/pdf/2311.16605v2.pdf","comment":"Preprint; Work in progress"},{"id":"http://arxiv.org/abs/2311.18376v1","updated":"2023-11-30T09:19:12Z","published":"2023-11-30T09:19:12Z","title":"Age Effects on Decision-Making, Drift Diffusion Model","summary":" Training can improve human decision-making performance. After several\ntraining sessions, a person can quickly and accurately complete a task.\nHowever, decision-making is always a trade-off between accuracy and response\ntime. Factors such as age and drug abuse can affect the decision-making\nprocess. This study examines how training can improve the performance of\ndifferent age groups in completing a random dot motion (RDM) task. The\nparticipants are divided into two groups: old and young. They undergo a\nthree-phase training and then repeat the same RDM task. The hierarchical\ndrift-diffusion model analyzes the subjects' responses and determines how the\nmodel's parameters change after training for both age groups. The results show\nthat after training, the participants were able to accumulate sensory\ninformation faster, and the model drift rate increased. However, their decision\nboundary decreased as they became more confident and had a lower\ndecision-making threshold. Additionally, the old group had a higher boundary\nand lower drift rate in both pre and post-training, and there was less\ndifference between the two group parameters after training.\n","authors":["Zahra Kavian","Kimia Hajisadeghi","Yashar Rezazadeh","Mehrbod Faraji","Reza Ebrahimpour"],"pdf_url":"https://arxiv.org/pdf/2311.18376v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18364v1","updated":"2023-11-30T09:03:49Z","published":"2023-11-30T09:03:49Z","title":"Hubness Reduction Improves Sentence-BERT Semantic Spaces","summary":" Semantic representations of text, i.e. representations of natural language\nwhich capture meaning by geometry, are essential for areas such as information\nretrieval and document grouping. High-dimensional trained dense vectors have\nreceived much attention in recent years as such representations. We investigate\nthe structure of semantic spaces that arise from embeddings made with\nSentence-BERT and find that the representations suffer from a well-known\nproblem in high dimensions called hubness. Hubness results in asymmetric\nneighborhood relations, such that some texts (the hubs) are neighbours of many\nother texts while most texts (so-called anti-hubs), are neighbours of few or no\nother texts. We quantify the semantic quality of the embeddings using hubness\nscores and error rate of a neighbourhood based classifier. We find that when\nhubness is high, we can reduce error rate and hubness using hubness reduction\nmethods. We identify a combination of two methods as resulting in the best\nreduction. For example, on one of the tested pretrained models, this combined\nmethod can reduce hubness by about 75% and error rate by about 9%. Thus, we\nargue that mitigating hubness in the embedding space provides better semantic\nrepresentations of text.\n","authors":["Beatrix M. G. 
Nielsen","Lars Kai Hansen"],"pdf_url":"https://arxiv.org/pdf/2311.18364v1.pdf","comment":"Accepted at NLDL 2024"},{"id":"http://arxiv.org/abs/2305.13172v3","updated":"2023-11-30T08:55:24Z","published":"2023-05-22T16:00:00Z","title":"Editing Large Language Models: Problems, Methods, and Opportunities","summary":" Despite the ability to train capable LLMs, the methodology for maintaining\ntheir relevancy and rectifying errors remains elusive. To this end, the past\nfew years have witnessed a surge in techniques for editing LLMs, the objective\nof which is to efficiently alter the behavior of LLMs within a specific domain\nwithout negatively impacting performance across other inputs. This paper\nembarks on a deep exploration of the problems, methods, and opportunities\nrelated to model editing for LLMs. In particular, we provide an exhaustive\noverview of the task definition and challenges associated with model editing,\nalong with an in-depth empirical analysis of the most progressive methods\ncurrently at our disposal. We also build a new benchmark dataset to facilitate\na more robust evaluation and pinpoint enduring issues intrinsic to existing\ntechniques. Our objective is to provide valuable insights into the\neffectiveness and feasibility of each editing technique, thereby assisting the\ncommunity in making informed decisions on the selection of the most appropriate\nmethod for a specific task or context. Code and datasets are available at\nhttps://github.com/zjunlp/EasyEdit.\n","authors":["Yunzhi Yao","Peng Wang","Bozhong Tian","Siyuan Cheng","Zhoubo Li","Shumin Deng","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.13172v3.pdf","comment":"EMNLP 2023. Updated with new experiments"},{"id":"http://arxiv.org/abs/2311.18356v1","updated":"2023-11-30T08:54:32Z","published":"2023-11-30T08:54:32Z","title":"Towards Comparable Active Learning","summary":" Active Learning has received significant attention in the field of machine\nlearning for its potential in selecting the most informative samples for\nlabeling, thereby reducing data annotation costs. However, we show that the\nreported lifts in recent literature generalize poorly to other domains leading\nto an inconclusive landscape in Active Learning research. Furthermore, we\nhighlight overlooked problems for reproducing AL experiments that can lead to\nunfair comparisons and increased variance in the results. This paper addresses\nthese issues by providing an Active Learning framework for a fair comparison of\nalgorithms across different tasks and domains, as well as a fast and performant\noracle algorithm for evaluation. To the best of our knowledge, we propose the\nfirst AL benchmark that tests algorithms in 3 major domains: Tabular, Image,\nand Text. We report empirical results for 6 widely used algorithms on 7\nreal-world and 2 synthetic datasets and aggregate them into a domain-specific\nranking of AL algorithms.\n","authors":["Thorben Werner","Johannes Burchert","Lars Schmidt-Thieme"],"pdf_url":"https://arxiv.org/pdf/2311.18356v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14333v2","updated":"2023-11-30T08:39:27Z","published":"2023-11-24T08:15:54Z","title":"Cycle Invariant Positional Encoding for Graph Representation Learning","summary":" Cycles are fundamental elements in graph-structured data and have\ndemonstrated their effectiveness in enhancing graph learning models. 
To encode\nsuch information into a graph learning framework, prior works often extract a\nsummary quantity, ranging from the number of cycles to the more sophisticated\npersistence diagram summaries. However, more detailed information, such as\nwhich edges are encoded in a cycle, has not yet been used in graph neural\nnetworks. In this paper, we make one step towards addressing this gap, and\npropose a structure encoding module, called CycleNet, that encodes cycle\ninformation via edge structure encoding in a permutation invariant manner. To\nefficiently encode the space of all cycles, we start with a cycle basis (i.e.,\na minimal set of cycles generating the cycle space) which we compute via the\nkernel of the 1-dimensional Hodge Laplacian of the input graph. To guarantee\nthe encoding is invariant w.r.t. the choice of cycle basis, we encode the cycle\ninformation via the orthogonal projector of the cycle basis, which is inspired\nby BasisNet proposed by Lim et al. We also develop a more efficient variant\nwhich however requires that the input graph has a unique shortest cycle basis.\nTo demonstrate the effectiveness of the proposed module, we provide some\ntheoretical understandings of its expressive power. Moreover, we show via a\nrange of experiments that networks enhanced by our CycleNet module perform\nbetter in various benchmarks compared to several existing SOTA models.\n","authors":["Zuoyu Yan","Tengfei Ma","Liangcai Gao","Zhi Tang","Chao Chen","Yusu Wang"],"pdf_url":"https://arxiv.org/pdf/2311.14333v2.pdf","comment":"Accepted as oral presentation in the Learning on Graphs Conference\n (LoG 2023)"},{"id":"http://arxiv.org/abs/2311.18348v1","updated":"2023-11-30T08:34:12Z","published":"2023-11-30T08:34:12Z","title":"Reconstructing Historical Climate Fields With Deep Learning","summary":" Historical records of climate fields are often sparse due to missing\nmeasurements, especially before the introduction of large-scale satellite\nmissions. Several statistical and model-based methods have been introduced to\nfill gaps and reconstruct historical records. Here, we employ a recently\nintroduced deep-learning approach based on Fourier convolutions, trained on\nnumerical climate model output, to reconstruct historical climate fields. Using\nthis approach we are able to realistically reconstruct large and irregular\nareas of missing data, as well as reconstruct known historical events such as\nstrong El Ni\\~no and La Ni\\~na with very little given information. Our method\noutperforms the widely used statistical kriging method as well as other recent\nmachine learning approaches. The model generalizes to higher resolutions than\nthe ones it was trained on and can be used on a variety of climate fields.\nMoreover, it allows inpainting of masks never seen before during the model\ntraining.\n","authors":["Nils Bochow","Anna Poltronieri","Martin Rypdal","Niklas Boers"],"pdf_url":"https://arxiv.org/pdf/2311.18348v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18341v1","updated":"2023-11-30T08:22:08Z","published":"2023-11-30T08:22:08Z","title":"Learning Robust Precipitation Forecaster by Temporal Frame Interpolation","summary":" Recent advancements in deep learning have propelled the field of weather\nprediction models to new heights. 
Despite their progress, these models often\nstruggle with real-world application due to their sensitivity to\nspatial-temporal shifts, a vulnerability particularly pronounced in weather\nprediction tasks where overfitting to local and temporal variations is common.\nThis paper presents an investigation into the development of a robust\nprecipitation forecasting model that stands resilient to such shifts. We\nintroduce Temporal Frame Interpolation (TFI), an innovative technique designed\nto fortify forecasting models against spatial-temporal discrepancies. TFI\noperates by generating synthetic samples through the interpolation of adjacent\nframes from satellite imagery and ground radar data, thereby enriching the\ntraining dataset and bolstering the model's defense against noise on frames.\nAdditionally, we integrate a novel multi-level dice loss, which exploits the\nordinal nature of rainfall intensities to further refine model performance.\nThese methodologies have collectively advanced our model's forecasting\nprecision, achieving \textit{1st place} on the transfer learning leaderboard in\nthe \textit{Weather4Cast'23 competition}. It not only demonstrates the efficacy\nof our approaches but also sets a new benchmark for deep learning applications\nin meteorological forecasting. Our code and weights have been made public at\n\url{https://github.com/Secilia-Cxy/UNetTFI}.\n","authors":["Lu Han","Xu-Yang Chen","Han-Jia Ye","De-Chuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2311.18341v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2212.02968 by other authors"},{"id":"http://arxiv.org/abs/2311.03197v3","updated":"2023-11-30T07:52:31Z","published":"2023-11-06T15:39:05Z","title":"Stable Linear Subspace Identification: A Machine Learning Approach","summary":" Machine Learning (ML) and linear System Identification (SI) have been\nhistorically developed independently. In this paper, we leverage\nwell-established ML tools - especially the automatic differentiation framework\n- to introduce SIMBa, a family of discrete linear multi-step-ahead state-space\nSI methods using backpropagation. SIMBa relies on a novel\nLinear-Matrix-Inequality-based free parametrization of Schur matrices to ensure\nthe stability of the identified model.\n We show how SIMBa generally outperforms traditional linear state-space SI\nmethods, and sometimes significantly, although at the price of a higher\ncomputational burden. This performance gap is particularly remarkable compared\nto other SI methods with stability guarantees, where the gain is frequently\nabove 25% in our investigations, hinting at SIMBa's ability to simultaneously\nachieve state-of-the-art fitting performance and enforce stability.\nInterestingly, these observations hold for a wide variety of input-output\nsystems and on both simulated and real-world data, showcasing the flexibility\nof the proposed approach. We postulate that this new SI paradigm presents a\ngreat extension potential to identify structured nonlinear models from data,\nand we hence open-source SIMBa on https://github.com/Cemempamoi/simba.\n","authors":["Loris Di Natale","Muhammad Zakwan","Bratislav Svetozarevic","Philipp Heer","Giancarlo Ferrari Trecate","Colin N. 
Jones"],"pdf_url":"https://arxiv.org/pdf/2311.03197v3.pdf","comment":"Submitted to ECC 2024"},{"id":"http://arxiv.org/abs/2311.18316v1","updated":"2023-11-30T07:35:56Z","published":"2023-11-30T07:35:56Z","title":"Learning for Semantic Knowledge Base-Guided Online Feature Transmission\n in Dynamic Channels","summary":" With the proliferation of edge computing, efficient AI inference on edge\ndevices has become essential for intelligent applications such as autonomous\nvehicles and VR/AR. In this context, we address the problem of efficient remote\nobject recognition by optimizing feature transmission between mobile devices\nand edge servers. We propose an online optimization framework to address the\nchallenge of dynamic channel conditions and device mobility in an end-to-end\ncommunication system. Our approach builds upon existing methods by leveraging a\nsemantic knowledge base to drive multi-level feature transmission, accounting\nfor temporal factors and dynamic elements throughout the transmission process.\nTo solve the online optimization problem, we design a novel soft\nactor-critic-based deep reinforcement learning system with a carefully designed\nreward function for real-time decision-making, overcoming the optimization\ndifficulty of the NP-hard problem and achieving the minimization of semantic\nloss while respecting latency constraints. Numerical results showcase the\nsuperiority of our approach compared to traditional greedy methods under\nvarious system setups.\n","authors":["Xiangyu Gao","Yaping Sun","Dongyu Wei","Xiaodong Xu","Hao Chen","Hao Yin","Shuguang Cui"],"pdf_url":"https://arxiv.org/pdf/2311.18316v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2311.18313v1","updated":"2023-11-30T07:31:36Z","published":"2023-11-30T07:31:36Z","title":"Automatic Implementation of Neural Networks through Reaction Networks --\n Part I: Circuit Design and Convergence Analysis","summary":" Information processing relying on biochemical interactions in the cellular\nenvironment is essential for biological organisms. The implementation of\nmolecular computational systems holds significant interest and potential in the\nfields of synthetic biology and molecular computation. This two-part article\naims to introduce a programmable biochemical reaction network (BCRN) system\nendowed with mass action kinetics that realizes the fully connected neural\nnetwork (FCNN) and has the potential to act automatically in vivo. In part I,\nthe feedforward propagation computation, the backpropagation component, and all\nbridging processes of FCNN are ingeniously designed as specific BCRN modules\nbased on their dynamics. This approach addresses a design gap in the\nbiochemical assignment module and judgment termination module and provides a\nnovel precise and robust realization of bi-molecular reactions for the learning\nprocess. 
Through equilibrium approaching, we demonstrate that the designed BCRN\nsystem achieves FCNN functionality with exponential convergence to target\ncomputational results, thereby enhancing the theoretical support for such work.\nFinally, the performance of this construction is further evaluated on two\ntypical logic classification problems.\n","authors":["Yuzhen Fan","Xiaoyu Zhang","Chuanhou Gao","Denis Dochain"],"pdf_url":"https://arxiv.org/pdf/2311.18313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18307v1","updated":"2023-11-30T07:25:24Z","published":"2023-11-30T07:25:24Z","title":"Categorical Traffic Transformer: Interpretable and Diverse Behavior\n Prediction with Tokenized Latent","summary":" Adept traffic models are critical to both planning and closed-loop simulation\nfor autonomous vehicles (AV), and key design objectives include accuracy,\ndiverse multimodal behaviors, interpretability, and downstream compatibility.\nRecently, with the advent of large language models (LLMs), an additional\ndesirable feature for traffic models is LLM compatibility. We present\nCategorical Traffic Transformer (CTT), a traffic model that outputs both\ncontinuous trajectory predictions and tokenized categorical predictions (lane\nmodes, homotopies, etc.). The most outstanding feature of CTT is its fully\ninterpretable latent space, which enables direct supervision of the latent\nvariable from the ground truth during training and avoids mode collapse\ncompletely. As a result, CTT can generate diverse behaviors conditioned on\ndifferent latent modes with semantic meanings while beating SOTA on prediction\naccuracy. In addition, CTT's ability to input and output tokens enables\nintegration with LLMs for common-sense reasoning and zero-shot generalization.\n","authors":["Yuxiao Chen","Sander Tonkens","Marco Pavone"],"pdf_url":"https://arxiv.org/pdf/2311.18307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18306v1","updated":"2023-11-30T07:22:55Z","published":"2023-11-30T07:22:55Z","title":"PAUNet: Precipitation Attention-based U-Net for rain prediction from\n satellite radiance data","summary":" This paper introduces Precipitation Attention-based U-Net (PAUNet), a deep\nlearning architecture for predicting precipitation from satellite radiance\ndata, addressing the challenges of the Weather4cast 2023 competition. PAUNet is\na variant of U-Net and Res-Net, designed to effectively capture the large-scale\ncontextual information of multi-band satellite images in visible, water vapor,\nand infrared bands through encoder convolutional layers with center cropping\nand attention mechanisms. We built upon the Focal Precipitation Loss including\nan exponential component (e-FPL), which further enhanced the importance across\ndifferent precipitation categories, particularly medium and heavy rain. Trained\non a substantial dataset from various European regions, PAUNet demonstrates\nnotable accuracy with a higher Critical Success Index (CSI) score than the\nbaseline model in predicting rainfall over multiple time slots. PAUNet's\narchitecture and training methodology showcase improvements in precipitation\nforecasting, crucial for sectors like emergency services and retail and supply\nchain management.\n","authors":["P. 
Jyoteeshkumar Reddy","Harish Baki","Sandeep Chinta","Richard Matear","John Taylor"],"pdf_url":"https://arxiv.org/pdf/2311.18306v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01442v3","updated":"2023-11-30T06:51:26Z","published":"2023-11-02T17:55:41Z","title":"Deep Double Descent for Time Series Forecasting: Avoiding Undertrained\n Models","summary":" Deep learning models, particularly Transformers, have achieved impressive\nresults in various domains, including time series forecasting. While existing\ntime series literature primarily focuses on model architecture modifications\nand data augmentation techniques, this paper explores the training schema of\ndeep learning models for time series; how models are trained regardless of\ntheir architecture. We perform extensive experiments to investigate the\noccurrence of deep double descent in several Transformer models trained on\npublic time series data sets. We demonstrate epoch-wise deep double descent and\nthat overfitting can be reverted using more epochs. Leveraging these findings,\nwe achieve state-of-the-art results for long sequence time series forecasting\nin nearly 70% of the 72 benchmarks tested. This suggests that many models in\nthe literature may possess untapped potential. Additionally, we introduce a\ntaxonomy for classifying training schema modifications, covering data\naugmentation, model inputs, model targets, time series per model, and\ncomputational budget.\n","authors":["Valentino Assandri","Sam Heshmati","Burhaneddin Yaman","Anton Iakovlev","Ariel Emiliano Repetur"],"pdf_url":"https://arxiv.org/pdf/2311.01442v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.03215v2","updated":"2023-11-30T06:51:24Z","published":"2022-01-10T08:47:52Z","title":"Handwriting recognition and automatic scoring for descriptive answers in\n Japanese language tests","summary":" This paper presents an experiment of automatically scoring handwritten\ndescriptive answers in the trial tests for the new Japanese university entrance\nexamination, which were made for about 120,000 examinees in 2017 and 2018.\nThere are about 400,000 answers with more than 20 million characters. Although\nall answers have been scored by human examiners, handwritten characters are not\nlabeled. We present our attempt to adapt deep neural network-based handwriting\nrecognizers trained on a labeled handwriting dataset into this unlabeled answer\nset. Our proposed method combines different training strategies, ensembles\nmultiple recognizers, and uses a language model built from a large general\ncorpus to avoid overfitting into specific data. In our experiment, the proposed\nmethod records character accuracy of over 97% using about 2,000 verified\nlabeled answers that account for less than 0.5% of the dataset. Then, the\nrecognized answers are fed into a pre-trained automatic scoring system based on\nthe BERT model without correcting misrecognized characters and providing rubric\nannotations. The automatic scoring system achieves from 0.84 to 0.98 of\nQuadratic Weighted Kappa (QWK). As QWK is over 0.8, it represents an acceptable\nsimilarity of scoring between the automatic scoring system and the human\nexaminers. 
These results are promising for further research on end-to-end\nautomatic scoring of descriptive answers.\n","authors":["Hung Tuan Nguyen","Cuong Tuan Nguyen","Haruki Oka","Tsunenori Ishioka","Masaki Nakagawa"],"pdf_url":"https://arxiv.org/pdf/2201.03215v2.pdf","comment":"Keywords: handwritten Japanese answers, handwriting recognition,\n automatic scoring, ensemble recognition, deep neural networks; Reported in\n IEICE technical report, PRMU2021-32, pp.45-50 (2021.12) Published after peer\n review and Presented in ICFHR2022, Lecture Notes in Computer Science, vol.\n 13639, pp. 274-284 (2022.11)"},{"id":"http://arxiv.org/abs/2012.15408v3","updated":"2023-11-30T06:40:48Z","published":"2020-12-31T02:42:27Z","title":"Gated Ensemble of Spatio-temporal Mixture of Experts for Multi-task\n Learning in Ride-hailing System","summary":" Designing spatio-temporal forecasting models separately in a task-wise and\ncity-wise manner poses a burden for the expanding transportation network\ncompanies. Therefore, a multi-task learning architecture is proposed in this\nstudy by developing gated ensemble of spatio-temporal mixture of experts\nnetwork (GESME-Net) with convolutional recurrent neural network (CRNN),\nconvolutional neural network (CNN), and recurrent neural network (RNN) for\nsimultaneously forecasting spatio-temporal tasks in a city as well as across\ndifferent cities. Furthermore, a task adaptation layer is integrated with the\narchitecture for learning joint representation in multi-task learning and\nrevealing the contribution of the input features utilized in prediction. The\nproposed architecture is tested with data from Didi Chuxing for: (i)\nsimultaneously forecasting demand and supply-demand gap in Beijing, and (ii)\nsimultaneously forecasting demand across Chengdu and Xian. In both scenarios,\nmodels from our proposed architecture outperformed the single-task and\nmulti-task deep learning benchmarks and ensemble-based machine learning\nalgorithms.\n","authors":["M. H. Rahman","S. M. Rifaat","S. N. Sadeek","M. Abrar","D. Wang"],"pdf_url":"https://arxiv.org/pdf/2012.15408v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2012.08868"},{"id":"http://arxiv.org/abs/2305.00162v2","updated":"2023-11-30T06:39:29Z","published":"2023-04-29T03:59:35Z","title":"Beyond Prediction: On-street Parking Recommendation using Heterogeneous\n Graph-based List-wise Ranking","summary":" To provide real-time parking information, existing studies focus on\npredicting parking availability, which seems an indirect approach to saving\ndrivers' cruising time. In this paper, we first time propose an on-street\nparking recommendation (OPR) task to directly recommend a parking space for a\ndriver. To this end, a learn-to-rank (LTR) based OPR model called OPR-LTR is\nbuilt. Specifically, parking recommendation is closely related to the \"turnover\nevents\" (state switching between occupied and vacant) of each parking space,\nand hence we design a highly efficient heterogeneous graph called ESGraph to\nrepresent historical and real-time meters' turnover events as well as\ngeographical relations; afterward, a convolution-based event-then-graph network\nis used to aggregate and update representations of the heterogeneous graph. A\nranking model is further utilized to learn a score function that helps\nrecommend a list of ranked parking spots for a specific on-street parking\nquery. The method is verified using the on-street parking meter data in Hong\nKong and San Francisco. 
By comparing with the other two types of methods:\nprediction-only and prediction-then-recommendation, the proposed\ndirect-recommendation method achieves satisfactory performance in different\nmetrics. Extensive experiments also demonstrate that the proposed ESGraph and\nthe recommendation model are more efficient in terms of computational\nefficiency as well as saving drivers' on-street parking time.\n","authors":["Hanyu Sun","Xiao Huang","Wei Ma"],"pdf_url":"https://arxiv.org/pdf/2305.00162v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18274v1","updated":"2023-11-30T06:25:06Z","published":"2023-11-30T06:25:06Z","title":"Semiparametric Efficient Inference in Adaptive Experiments","summary":" We consider the problem of efficient inference of the Average Treatment\nEffect in a sequential experiment where the policy governing the assignment of\nsubjects to treatment or control can change over time. We first provide a\ncentral limit theorem for the Adaptive Augmented Inverse-Probability Weighted\nestimator, which is semiparametric efficient, under weaker assumptions than\nthose previously made in the literature. This central limit theorem enables\nefficient inference at fixed sample sizes. We then consider a sequential\ninference setting, deriving both asymptotic and nonasymptotic confidence\nsequences that are considerably tighter than previous methods. These\nanytime-valid methods enable inference under data-dependent stopping times\n(sample sizes). Additionally, we use propensity score truncation techniques\nfrom the recent off-policy estimation literature to reduce the finite sample\nvariance of our estimator without affecting the asymptotic variance. Empirical\nresults demonstrate that our methods yield narrower confidence sequences than\nthose previously developed in the literature while maintaining time-uniform\nerror control.\n","authors":["Thomas Cook","Alan Mishler","Aaditya Ramdas"],"pdf_url":"https://arxiv.org/pdf/2311.18274v1.pdf","comment":"23 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.12825v2","updated":"2023-11-30T06:12:38Z","published":"2023-09-30T18:08:00Z","title":"A PSO Based Method to Generate Actionable Counterfactuals for High\n Dimensional Data","summary":" Counterfactual explanations (CFE) are methods that explain a machine learning\nmodel by giving an alternate class prediction of a data point with some minimal\nchanges in its features. It helps the users to identify their data attributes\nthat caused an undesirable prediction like a loan or credit card rejection. We\ndescribe an efficient and an actionable counterfactual (CF) generation method\nbased on particle swarm optimization (PSO). We propose a simple objective\nfunction for the optimization of the instance-centric CF generation problem.\nThe PSO brings in a lot of flexibility in terms of carrying out multi-objective\noptimization in large dimensions, capability for multiple CF generation, and\nsetting box constraints or immutability of data attributes. An algorithm is\nproposed that incorporates these features and it enables greater control over\nthe proximity and sparsity properties over the generated CFs. 
The proposed\nalgorithm is evaluated with a set of action-ability metrics in real-world\ndatasets, and the results were superior to those of the\nstate-of-the-art methods.\n","authors":["Shashank Shekhar","Asif Salim","Adesh Bansode","Vivaswan Jinturkar","Anirudha Nayak"],"pdf_url":"https://arxiv.org/pdf/2311.12825v2.pdf","comment":"Accepted in IEEE CSDE 2023"},{"id":"http://arxiv.org/abs/2112.05128v2","updated":"2023-11-30T06:04:52Z","published":"2021-12-09T18:58:36Z","title":"Fair Community Detection and Structure Learning in Heterogeneous\n Graphical Models","summary":" Inference of community structure in probabilistic graphical models may not be\nconsistent with fairness constraints when nodes have demographic attributes.\nCertain demographics may be over-represented in some detected communities and\nunder-represented in others. This paper defines a novel $\ell_1$-regularized\npseudo-likelihood approach for fair graphical model selection. In particular,\nwe assume there is some community or clustering structure in the true\nunderlying graph, and we seek to learn a sparse undirected graph and its\ncommunities from the data such that demographic groups are fairly represented\nwithin the communities. In the case when the graph is known a priori, we\nprovide a convex semidefinite programming approach for fair community\ndetection. We establish the statistical consistency of the proposed method for\nboth a Gaussian graphical model and an Ising model for, respectively,\ncontinuous and binary data, proving that our method can recover the graphs and\ntheir fair communities with high probability.\n","authors":["Davoud Ataee Tarzanagh","Laura Balzano","Alfred O. Hero"],"pdf_url":"https://arxiv.org/pdf/2112.05128v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18261v1","updated":"2023-11-30T05:40:55Z","published":"2023-11-30T05:40:55Z","title":"Learning Exactly Linearizable Deep Dynamics Models","summary":" Research on control using models based on machine-learning methods has now\nshifted to the practical engineering stage. Achieving high performance and\ntheoretically guaranteeing the safety of the system is critical for such\napplications. In this paper, we propose a learning method for exactly\nlinearizable dynamical models that can easily apply various control theories to\nensure stability, reliability, etc., and to provide a high degree of freedom of\nexpression. As an example, we present a design that combines simple linear\ncontrol and control barrier functions. The proposed model is employed for the\nreal-time control of an automotive engine, and the results demonstrate good\npredictive performance and stable control under constraints.\n","authors":["Ryuta Moriyasu","Masayuki Kusunoki","Kenji Kashima"],"pdf_url":"https://arxiv.org/pdf/2311.18261v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18260v1","updated":"2023-11-30T05:38:34Z","published":"2023-11-30T05:38:34Z","title":"Consensus, dissensus and synergy between clinicians and specialist\n foundation models in radiology report generation","summary":" Radiology reports are an instrumental part of modern medicine, informing key\nclinical decisions such as diagnosis and treatment. 
The worldwide shortage of\nradiologists, however, restricts access to expert care and imposes heavy\nworkloads, contributing to avoidable errors and delays in report delivery.\nWhile recent progress in automated report generation with vision-language\nmodels offer clear potential in ameliorating the situation, the path to\nreal-world adoption has been stymied by the challenge of evaluating the\nclinical quality of AI-generated reports. In this study, we build a\nstate-of-the-art report generation system for chest radiographs, Flamingo-CXR,\nby fine-tuning a well-known vision-language foundation model on radiology data.\nTo evaluate the quality of the AI-generated reports, a group of 16 certified\nradiologists provide detailed evaluations of AI-generated and human written\nreports for chest X-rays from an intensive care setting in the United States\nand an inpatient setting in India. At least one radiologist (out of two per\ncase) preferred the AI report to the ground truth report in over 60$\\%$ of\ncases for both datasets. Amongst the subset of AI-generated reports that\ncontain errors, the most frequently cited reasons were related to the location\nand finding, whereas for human written reports, most mistakes were related to\nseverity and finding. This disparity suggested potential complementarity\nbetween our AI system and human experts, prompting us to develop an assistive\nscenario in which Flamingo-CXR generates a first-draft report, which is\nsubsequently revised by a clinician. This is the first demonstration of\nclinician-AI collaboration for report writing, and the resultant reports are\nassessed to be equivalent or preferred by at least one radiologist to reports\nwritten by experts alone in 80$\\%$ of in-patient cases and 66$\\%$ of intensive\ncare cases.\n","authors":["Ryutaro Tanno","David G. T. Barrett","Andrew Sellergren","Sumedh Ghaisas","Sumanth Dathathri","Abigail See","Johannes Welbl","Karan Singhal","Shekoofeh Azizi","Tao Tu","Mike Schaekermann","Rhys May","Roy Lee","SiWai Man","Zahra Ahmed","Sara Mahdavi","Danielle Belgrave","Vivek Natarajan","Shravya Shetty","Pushmeet Kohli","Po-Sen Huang","Alan Karthikesalingam","Ira Ktena"],"pdf_url":"https://arxiv.org/pdf/2311.18260v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18257v1","updated":"2023-11-30T05:15:35Z","published":"2023-11-30T05:15:35Z","title":"Diffusion Models Without Attention","summary":" In recent advancements in high-fidelity image generation, Denoising Diffusion\nProbabilistic Models (DDPMs) have emerged as a key player. However, their\napplication at high resolutions presents significant computational challenges.\nCurrent methods, such as patchifying, expedite processes in UNet and\nTransformer architectures but at the expense of representational capacity.\nAddressing this, we introduce the Diffusion State Space Model (DiffuSSM), an\narchitecture that supplants attention mechanisms with a more scalable state\nspace model backbone. This approach effectively handles higher resolutions\nwithout resorting to global compression, thus preserving detailed image\nrepresentation throughout the diffusion process. 
Our focus on FLOP-efficient\narchitectures in diffusion training marks a significant step forward.\nComprehensive evaluations on both ImageNet and LSUN datasets at two resolutions\ndemonstrate that DiffuSSMs are on par with or even outperform existing diffusion\nmodels with attention modules in FID and Inception Score metrics while\nsignificantly reducing total FLOP usage.\n","authors":["Jing Nathan Yan","Jiatao Gu","Alexander M. Rush"],"pdf_url":"https://arxiv.org/pdf/2311.18257v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18252v1","updated":"2023-11-30T05:03:08Z","published":"2023-11-30T05:03:08Z","title":"Navigating Privacy and Copyright Challenges Across the Data Lifecycle of\n Generative AI","summary":" The advent of Generative AI has marked a significant milestone in artificial\nintelligence, demonstrating remarkable capabilities in generating realistic\nimages, texts, and data patterns. However, these advancements come with\nheightened concerns over data privacy and copyright infringement, primarily due\nto the reliance on vast datasets for model training. Traditional approaches\nlike differential privacy, machine unlearning, and data poisoning only offer\nfragmented solutions to these complex issues. Our paper delves into the\nmultifaceted challenges of privacy and copyright protection within the data\nlifecycle. We advocate for integrated approaches that combine technical\ninnovation with ethical foresight, holistically addressing these concerns by\ninvestigating and devising solutions that are informed by the lifecycle\nperspective. This work aims to catalyze a broader discussion and inspire\nconcerted efforts towards data privacy and copyright integrity in Generative\nAI.\n","authors":["Dawen Zhang","Boming Xia","Yue Liu","Xiwei Xu","Thong Hoang","Zhenchang Xing","Mark Staples","Qinghua Lu","Liming Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.18252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18246v1","updated":"2023-11-30T04:36:25Z","published":"2023-11-30T04:36:25Z","title":"Combined Scheduling, Memory Allocation and Tensor Replacement for\n Minimizing Off-Chip Data Accesses of DNN Accelerators","summary":" Specialized hardware accelerators have been extensively used for Deep Neural\nNetworks (DNNs) to provide power/performance benefits. These accelerators\ncontain specialized hardware that supports DNN operators, and scratchpad memory\nfor storing the tensor operands. Often, the size of the scratchpad is\ninsufficient to store all the tensors needed for the computation, and\nadditional data accesses are needed to move tensors back and forth from host\nmemory during the computation with significant power/performance overhead. The\nvolume of these additional data accesses depends on the operator schedule, and\nmemory allocation (specific locations selected for the tensors in the\nscratchpad). We propose an optimization framework, named COSMA, for mapping\nDNNs to an accelerator that finds the optimal operator schedule, memory\nallocation and tensor replacement that minimizes the additional data accesses.\nCOSMA provides an Integer Linear Programming (ILP) formulation to generate the\noptimal solution for mapping a DNN to the accelerator for a given scratchpad\nsize. We demonstrate that, using an off-the-shelf ILP solver, COSMA obtains the\noptimal solution in seconds for a wide range of state-of-the-art DNNs for\ndifferent applications. Further, it outperforms existing methods by reducing\non average 84% of the non-compulsory data accesses. 
We further propose a\ndivide-and-conquer heuristic to scale up to certain complex DNNs generated by\nNeural Architecture Search, and this heuristic solution reduces on average 85%\ndata accesses compared with other works.\n","authors":["Yi Li","Aarti Gupta","Sharad Malik"],"pdf_url":"https://arxiv.org/pdf/2311.18246v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18244v1","updated":"2023-11-30T04:25:28Z","published":"2023-11-30T04:25:28Z","title":"Poisoning Attacks Against Contrastive Recommender Systems","summary":" Contrastive learning (CL) has recently gained significant popularity in the\nfield of recommendation. Its ability to learn without heavy reliance on labeled\ndata is a natural antidote to the data sparsity issue. Previous research has\nfound that CL can not only enhance recommendation accuracy but also\ninadvertently exhibit remarkable robustness against noise. However, this paper\nidentifies a vulnerability of CL-based recommender systems: Compared with their\nnon-CL counterparts, they are even more susceptible to poisoning attacks that\naim to promote target items. Our analysis points to the uniform dispersion of\nrepresentations led by the CL loss as the very factor that accounts for this\nvulnerability. We further theoretically and empirically demonstrate that the\noptimization of CL loss can lead to smooth spectral values of representations.\nBased on these insights, we attempt to reveal the potential poisoning attacks\nagainst CL-based recommender systems. The proposed attack encompasses a\ndual-objective framework: One that induces a smoother spectral value\ndistribution to amplify the CL loss's inherent dispersion effect, named\ndispersion promotion; and the other that directly elevates the visibility of\ntarget items, named rank promotion. We validate the destructiveness of our\nattack model through extensive experimentation on four datasets. By shedding\nlight on these vulnerabilities, we aim to facilitate the development of more\nrobust CL-based recommender systems.\n","authors":["Zongwei Wang","Junliang Yu","Min Gao","Hongzhi Yin","Bin Cui","Shazia Sadiq"],"pdf_url":"https://arxiv.org/pdf/2311.18244v1.pdf","comment":"14pages,6 figures,5 tables"},{"id":"http://arxiv.org/abs/2311.18243v1","updated":"2023-11-30T04:21:10Z","published":"2023-11-30T04:21:10Z","title":"DKiS: Decay weight invertible image steganography with private key","summary":" Image steganography, the practice of concealing information within another\nimage, traditionally faces security challenges when its methods become publicly\nknown. To counteract this, we introduce a novel private key-based image\nsteganography technique. This approach ensures the security of hidden\ninformation, requiring a corresponding private key for access, irrespective of\nthe public knowledge of the steganography method. We present experimental\nevidence demonstrating our method's effectiveness, showcasing its real-world\napplicability. Additionally, we identified a critical challenge in the\ninvertible image steganography process: the transfer of non-essential, or\n`garbage', information from the secret to the host pipeline. To address this,\nwe introduced the decay weight to control the information transfer, filtering\nout irrelevant data and enhancing the performance of image steganography. 
Our\ncode is publicly accessible at https://github.com/yanghangAI/DKiS, and a\npractical demonstration is available at http://yanghang.site/hidekey.\n","authors":["Hang Yang","Yitian Xu","Xuhua Liu"],"pdf_url":"https://arxiv.org/pdf/2311.18243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18237v1","updated":"2023-11-30T04:07:44Z","published":"2023-11-30T04:07:44Z","title":"Label-efficient Training of Small Task-specific Models by Leveraging\n Vision Foundation Models","summary":" Large Vision Foundation Models (VFMs) pretrained on massive datasets exhibit\nimpressive performance on various downstream tasks, especially with limited\nlabeled target data. However, due to their high memory and compute\nrequirements, these models cannot be deployed in resource constrained settings.\nThis raises an important question: How can we utilize the knowledge from a\nlarge VFM to train a small task-specific model for a new target task with\nlimited labeled training data? In this work, we answer this question by\nproposing a simple and highly effective task-oriented knowledge transfer\napproach to leverage pretrained VFMs for effective training of small\ntask-specific models. Our experimental results on four target tasks under\nlimited labeled data settings show that the proposed knowledge transfer\napproach outperforms task-agnostic VFM distillation, web-scale CLIP pretraining\nand supervised ImageNet pretraining by 1-10.5%, 2-22% and 2-14%, respectively.\nWe also show that the dataset used for transferring knowledge has a significant\neffect on the final target task performance, and propose an image\nretrieval-based approach for curating effective transfer sets.\n","authors":["Raviteja Vemulapalli","Hadi Pouransari","Fartash Faghri","Sachin Mehta","Mehrdad Farajtabar","Mohammad Rastegari","Oncel Tuzel"],"pdf_url":"https://arxiv.org/pdf/2311.18237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18232v1","updated":"2023-11-30T03:59:31Z","published":"2023-11-30T03:59:31Z","title":"LMRL Gym: Benchmarks for Multi-Turn Reinforcement Learning with Language\n Models","summary":" Large language models (LLMs) provide excellent text-generation capabilities,\nbut standard prompting and generation methods generally do not lead to\nintentional or goal-directed agents and might necessitate considerable prompt\ntuning. This becomes particularly apparent in multi-turn conversations: even\nthe best current LLMs rarely ask clarifying questions, engage in explicit\ninformation gathering, or take actions now that lead to better decisions after\nmultiple turns. Reinforcement learning has the potential to leverage the\npowerful modeling capabilities of LLMs, as well as their internal\nrepresentation of textual interactions, to create capable goal-directed\nlanguage agents. This can enable intentional and temporally extended\ninteractions, such as with humans, through coordinated persuasion and carefully\ncrafted questions, or in goal-directed play through text games to bring about\ndesired final outcomes. However, enabling this requires the community to\ndevelop stable and reliable reinforcement learning algorithms that can\neffectively train LLMs. Developing such algorithms requires tasks that can\ngauge progress on algorithm design, provide accessible and reproducible\nevaluations for multi-turn interactions, and cover a range of task properties\nand challenges in improving reinforcement learning algorithms. 
Our paper\nintroduces the LMRL-Gym benchmark for evaluating multi-turn RL for LLMs,\ntogether with an open-source research framework containing a basic toolkit for\ngetting started on multi-turn RL with offline value-based and policy-based RL\nmethods. Our benchmark consists of 8 different language tasks, which require\nmultiple rounds of language interaction and cover a range of tasks in\nopen-ended dialogue and text games.\n","authors":["Marwa Abdulhai","Isadora White","Charlie Snell","Charles Sun","Joey Hong","Yuexiang Zhai","Kelvin Xu","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2311.18232v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16487v2","updated":"2023-11-30T03:56:06Z","published":"2023-11-28T04:34:04Z","title":"On the Robustness of Decision-Focused Learning","summary":" Decision-Focused Learning (DFL) is an emerging learning paradigm that tackles\nthe task of training a machine learning (ML) model to predict missing\nparameters of an incomplete optimization problem, where the missing parameters\nare predicted. DFL trains an ML model in an end-to-end system, by integrating\nthe prediction and optimization tasks, providing better alignment of the\ntraining and testing objectives. DFL has shown a lot of promise and holds the\ncapacity to revolutionize decision-making in many real-world applications.\nHowever, very little is known about the performance of these models under\nadversarial attacks. We adopt ten unique DFL methods and benchmark their\nperformance under two distinctly focused attacks adapted towards the\nPredict-then-Optimize problem setting. Our study proposes the hypothesis that\nthe robustness of a model is highly correlated with its ability to find\npredictions that lead to optimal decisions without deviating from the\nground-truth label. Furthermore, we provide insight into how to target the\nmodels that violate this condition and show how these models respond\ndifferently depending on the achieved optimality at the end of their training\ncycles.\n","authors":["Yehya Farhat"],"pdf_url":"https://arxiv.org/pdf/2311.16487v2.pdf","comment":"17 pages, 45 figures, submitted to AAAI artificial intelligence for\n operations research workshop"},{"id":"http://arxiv.org/abs/2311.17410v2","updated":"2023-11-30T03:48:24Z","published":"2023-11-29T07:30:32Z","title":"GNNFlow: A Distributed Framework for Continuous Temporal GNN Learning on\n Dynamic Graphs","summary":" Graph Neural Networks (GNNs) play a crucial role in various fields. However,\nmost existing deep graph learning frameworks assume pre-stored static graphs\nand do not support training on graph streams. In contrast, many real-world\ngraphs are dynamic and contain time domain information. We introduce GNNFlow, a\ndistributed framework that enables efficient continuous temporal graph\nrepresentation learning on dynamic graphs on multi-GPU machines. GNNFlow\nintroduces an adaptive time-indexed block-based data structure that effectively\nbalances memory usage with graph update and sampling operation efficiency. It\nfeatures a hybrid GPU-CPU graph data placement for rapid GPU-based temporal\nneighborhood sampling and kernel optimizations for enhanced sampling processes.\nA dynamic GPU cache for node and edge features is developed to maximize cache\nhit rates through reuse and restoration strategies. GNNFlow supports\ndistributed training across multiple machines with static scheduling to ensure\nload balance. We implement GNNFlow based on DGL and PyTorch. 
Our experimental\nresults show that GNNFlow provides up to 21.1x faster continuous learning than\nexisting systems.\n","authors":["Yuchen Zhong","Guangming Sheng","Tianzuo Qin","Minjie Wang","Quan Gan","Chuan Wu"],"pdf_url":"https://arxiv.org/pdf/2311.17410v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12230v2","updated":"2023-11-30T03:44:26Z","published":"2023-06-21T12:43:55Z","title":"Fantastic Weights and How to Find Them: Where to Prune in Dynamic Sparse\n Training","summary":" Dynamic Sparse Training (DST) is a rapidly evolving area of research that\nseeks to optimize the sparse initialization of a neural network by adapting its\ntopology during training. It has been shown that under specific conditions, DST\nis able to outperform dense models. The key components of this framework are\nthe pruning and growing criteria, which are repeatedly applied during the\ntraining process to adjust the network's sparse connectivity. While the growing\ncriterion's impact on DST performance is relatively well studied, the influence\nof the pruning criterion remains overlooked. To address this issue, we design\nand perform an extensive empirical analysis of various pruning criteria to\nbetter understand their impact on the dynamics of DST solutions. Surprisingly,\nwe find that most of the studied methods yield similar results. The differences\nbecome more significant in the low-density regime, where the best performance\nis predominantly given by the simplest technique: magnitude-based pruning. The\ncode is provided at https://github.com/alooow/fantastic_weights_paper\n","authors":["Aleksandra I. Nowak","Bram Grooten","Decebal Constantin Mocanu","Jacek Tabor"],"pdf_url":"https://arxiv.org/pdf/2306.12230v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2009.09213v5","updated":"2023-11-30T03:37:22Z","published":"2020-09-19T11:26:01Z","title":"Dodging DeepFake Detection via Implicit Spatial-Domain Notch Filtering","summary":" The current high-fidelity generation and high-precision detection of DeepFake\nimages are at an arms race. We believe that producing DeepFakes that are highly\nrealistic and 'detection evasive' can serve the ultimate goal of improving\nfuture generation DeepFake detection capabilities. In this paper, we propose a\nsimple yet powerful pipeline to reduce the artifact patterns of fake images\nwithout hurting image quality by performing implicit spatial-domain notch\nfiltering. We first demonstrate that frequency-domain notch filtering, although\nfamously shown to be effective in removing periodic noise in the spatial\ndomain, is infeasible for our task at hand due to the manual designs required\nfor the notch filters. We, therefore, resort to a learning-based approach to\nreproduce the notch filtering effects, but solely in the spatial domain. We\nadopt a combination of adding overwhelming spatial noise for breaking the\nperiodic noise pattern and deep image filtering to reconstruct the noise-free\nfake images, and we name our method DeepNotch. Deep image filtering provides a\nspecialized filter for each pixel in the noisy image, producing filtered images\nwith high fidelity compared to their DeepFake counterparts. Moreover, we also\nuse the semantic information of the image to generate an adversarial guidance\nmap to add noise intelligently. 
Our large-scale evaluation on 3 representative\nstate-of-the-art DeepFake detection methods (tested on 16 types of DeepFakes)\nhas demonstrated that our technique significantly reduces the accuracy of these\n3 fake image detection methods, 36.79% on average and up to 97.02% in the best\ncase.\n","authors":["Yihao Huang","Felix Juefei-Xu","Qing Guo","Yang Liu","Geguang Pu"],"pdf_url":"https://arxiv.org/pdf/2009.09213v5.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2311.18224v1","updated":"2023-11-30T03:36:19Z","published":"2023-11-30T03:36:19Z","title":"Reasoning with the Theory of Mind for Pragmatic Semantic Communication","summary":" In this paper, a pragmatic semantic communication framework that enables\neffective goal-oriented information sharing between two intelligent agents is\nproposed. In particular, semantics is defined as the causal state that\nencapsulates the fundamental causal relationships and dependencies among\ndifferent features extracted from data. The proposed framework leverages the\nemerging concept in machine learning (ML) called theory of mind (ToM). It\nemploys a dynamic two-level (wireless and semantic) feedback mechanism to\ncontinuously fine-tune neural network components at the transmitter. Thanks to\nthe ToM, the transmitter mimics the actual mental state of the receiver's\nreasoning neural network operating semantic interpretation. Then, the estimated\nmental state at the receiver is dynamically updated thanks to the proposed\ndynamic two-level feedback mechanism. At the lower level, conventional channel\nquality metrics are used to optimize the channel encoding process based on the\nwireless communication channel's quality, ensuring an efficient mapping of\nsemantic representations to a finite constellation. Additionally, a semantic\nfeedback level is introduced, providing information on the receiver's perceived\nsemantic effectiveness with minimal overhead. Numerical evaluations demonstrate\nthe framework's ability to achieve efficient communication with a reduced\namount of bits while maintaining the same semantics, outperforming conventional\nsystems that do not exploit the ToM-based reasoning.\n","authors":["Christo Kurisummoottil Thomas","Emilio Calvanese Strinati","Walid Saad"],"pdf_url":"https://arxiv.org/pdf/2311.18224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09485v2","updated":"2023-11-30T03:19:03Z","published":"2023-10-14T04:17:00Z","title":"Applying Bayesian Ridge Regression AI Modeling in Virus Severity\n Prediction","summary":" Artificial intelligence (AI) is a powerful tool for reshaping healthcare\nsystems. In healthcare, AI is invaluable for its capacity to manage vast\namounts of data, which can lead to more accurate and speedy diagnoses,\nultimately easing the workload on healthcare professionals. As a result, AI has\nproven itself to be a powerful tool across various industries, simplifying complex\ntasks and pattern recognition that would otherwise be overwhelming for humans\nor traditional computer algorithms. In this paper, we review the strengths and\nweaknesses of Bayesian Ridge Regression, an AI model that can be used to bring\ncutting edge virus analysis to healthcare professionals around the world. The\nmodel's accuracy assessment revealed promising results, with room for\nimprovement primarily related to data organization. 
In addition, the severity\nindex serves as a valuable tool to gain a broad overview of patient care needs,\naligning with healthcare professionals' preference for broader categorizations.\n","authors":["Jai Pal","Bryan Hong"],"pdf_url":"https://arxiv.org/pdf/2310.09485v2.pdf","comment":"7 pages, 2 figures, 5 listings"},{"id":"http://arxiv.org/abs/2311.18208v1","updated":"2023-11-30T03:05:14Z","published":"2023-11-30T03:05:14Z","title":"SMaRt: Improving GANs with Score Matching Regularity","summary":" Generative adversarial networks (GANs) usually struggle in learning from\nhighly diverse data, whose underlying manifold is complex. In this work, we\nrevisit the mathematical foundations of GANs, and theoretically reveal that the\nnative adversarial loss for GAN training is insufficient to fix the problem of\nsubsets with positive Lebesgue measure of the generated data manifold lying out\nof the real data manifold. Instead, we find that score matching serves as a\nvalid solution to this issue thanks to its capability of persistently pushing\nthe generated data points towards the real data manifold. We thereby propose to\nimprove the optimization of GANs with score matching regularity (SMaRt).\nRegarding the empirical evidences, we first design a toy example to show that\ntraining GANs by the aid of a ground-truth score function can help reproduce\nthe real data distribution more accurately, and then confirm that our approach\ncan consistently boost the synthesis performance of various state-of-the-art\nGANs on real-world datasets with pre-trained diffusion models acting as the\napproximate score function. For instance, when training Aurora on the ImageNet\n64x64 dataset, we manage to improve FID from 8.87 to 7.11, on par with the\nperformance of one-step consistency model. The source code will be made public.\n","authors":["Mengfei Xia","Yujun Shen","Ceyuan Yang","Ran Yi","Wenping Wang","Yong-jin Liu"],"pdf_url":"https://arxiv.org/pdf/2311.18208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13236v2","updated":"2023-11-30T03:01:55Z","published":"2023-05-22T17:10:44Z","title":"ADA-GP: Accelerating DNN Training By Adaptive Gradient Prediction","summary":" Neural network training is inherently sequential where the layers finish the\nforward propagation in succession, followed by the calculation and\nback-propagation of gradients (based on a loss function) starting from the last\nlayer. The sequential computations significantly slow down neural network\ntraining, especially the deeper ones. Prediction has been successfully used in\nmany areas of computer architecture to speed up sequential processing.\nTherefore, we propose ADA-GP, which uses gradient prediction adaptively to\nspeed up deep neural network (DNN) training while maintaining accuracy. ADA-GP\nworks by incorporating a small neural network to predict gradients for\ndifferent layers of a DNN model. ADA-GP uses a novel tensor reorganization\nmethod to make it feasible to predict a large number of gradients. ADA-GP\nalternates between DNN training using backpropagated gradients and DNN training\nusing predicted gradients. ADA-GP adaptively adjusts when and for how long\ngradient prediction is used to strike a balance between accuracy and\nperformance. Last but not least, we provide a detailed hardware extension in a\ntypical DNN accelerator to realize the speed up potential from gradient\nprediction. 
Our extensive experiments with fifteen DNN models show that ADA-GP\ncan achieve an average speedup of 1.47X with similar or even higher accuracy\nthan the baseline models. Moreover, it consumes, on average, 34% less energy\ndue to reduced off-chip memory accesses compared to the baseline accelerator.\n","authors":["Vahid Janfaza","Shantanu Mandal","Farabi Mahmud","Abdullah Muzahid"],"pdf_url":"https://arxiv.org/pdf/2305.13236v2.pdf","comment":"13 pages, 21 figures, 5 tables"},{"id":"http://arxiv.org/abs/2311.18207v1","updated":"2023-11-30T02:56:49Z","published":"2023-11-30T02:56:49Z","title":"Towards Assessing and Benchmarking Risk-Return Tradeoff of Off-Policy\n Evaluation","summary":" Off-Policy Evaluation (OPE) aims to assess the effectiveness of\ncounterfactual policies using only offline logged data and is often used to\nidentify the top-k promising policies for deployment in online A/B tests.\nExisting evaluation metrics for OPE estimators primarily focus on the\n\"accuracy\" of OPE or that of downstream policy selection, neglecting\nrisk-return tradeoff in the subsequent online policy deployment. To address\nthis issue, we draw inspiration from portfolio evaluation in finance and\ndevelop a new metric, called SharpeRatio@k, which measures the risk-return\ntradeoff of policy portfolios formed by an OPE estimator under varying online\nevaluation budgets (k). We validate our metric in two example scenarios,\ndemonstrating its ability to effectively distinguish between low-risk and\nhigh-risk estimators and to accurately identify the most efficient estimator.\nThis efficient estimator is characterized by its capability to form the most\nadvantageous policy portfolios, maximizing returns while minimizing risks\nduring online deployment, a nuance that existing metrics typically overlook. To\nfacilitate a quick, accurate, and consistent evaluation of OPE via\nSharpeRatio@k, we have also integrated this metric into an open-source\nsoftware, SCOPE-RL. Employing SharpeRatio@k and SCOPE-RL, we conduct\ncomprehensive benchmarking experiments on various estimators and RL tasks,\nfocusing on their risk-return tradeoff. These experiments offer several\ninteresting directions and suggestions for future OPE research.\n","authors":["Haruka Kiyohara","Ren Kishimoto","Kosuke Kawakami","Ken Kobayashi","Kazuhide Nakata","Yuta Saito"],"pdf_url":"https://arxiv.org/pdf/2311.18207v1.pdf","comment":"preprint, under review"},{"id":"http://arxiv.org/abs/2311.18206v1","updated":"2023-11-30T02:56:43Z","published":"2023-11-30T02:56:43Z","title":"SCOPE-RL: A Python Library for Offline Reinforcement Learning and\n Off-Policy Evaluation","summary":" This paper introduces SCOPE-RL, a comprehensive open-source Python software\ndesigned for offline reinforcement learning (offline RL), off-policy evaluation\n(OPE), and selection (OPS). Unlike most existing libraries that focus solely on\neither policy learning or evaluation, SCOPE-RL seamlessly integrates these two\nkey aspects, facilitating flexible and complete implementations of both offline\nRL and OPE processes. SCOPE-RL puts particular emphasis on its OPE modules,\noffering a range of OPE estimators and robust evaluation-of-OPE protocols. This\napproach enables more in-depth and reliable OPE compared to other packages. For\ninstance, SCOPE-RL enhances OPE by estimating the entire reward distribution\nunder a policy rather than its mere point-wise expected value. 
Additionally,\nSCOPE-RL provides a more thorough evaluation-of-OPE by presenting the\nrisk-return tradeoff in OPE results, extending beyond mere accuracy evaluations\nin existing OPE literature. SCOPE-RL is designed with user accessibility in\nmind. Its user-friendly APIs, comprehensive documentation, and a variety of\neasy-to-follow examples assist researchers and practitioners in efficiently\nimplementing and experimenting with various offline RL methods and OPE\nestimators, tailored to their specific problem contexts. The documentation of\nSCOPE-RL is available at https://scope-rl.readthedocs.io/en/latest/.\n","authors":["Haruka Kiyohara","Ren Kishimoto","Kosuke Kawakami","Ken Kobayashi","Kazuhide Nakata","Yuta Saito"],"pdf_url":"https://arxiv.org/pdf/2311.18206v1.pdf","comment":"preprint, open-source software:\n https://github.com/hakuhodo-technologies/scope-rl"},{"id":"http://arxiv.org/abs/2308.09952v3","updated":"2023-11-30T02:56:01Z","published":"2023-08-19T09:12:47Z","title":"Finding emergence in data by maximizing effective information","summary":" Quantifying emergence and modeling emergent dynamics in a data-driven manner\nfor complex dynamical systems is challenging due to the lack of direct\nobservations at the micro-level. Thus, it's crucial to develop a framework to\nidentify emergent phenomena and capture emergent dynamics at the macro-level\nusing available data. Inspired by the theory of causal emergence (CE), this\npaper introduces a machine learning framework to learn macro-dynamics in an\nemergent latent space and quantify the degree of CE. The framework maximizes\neffective information, resulting in a macro-dynamics model with enhanced causal\neffects. Experimental results on simulated and real data demonstrate the\neffectiveness of the proposed framework. It quantifies degrees of CE\neffectively under various conditions and reveals distinct influences of\ndifferent noise types. It can learn a one-dimensional coarse-grained\nmacro-state from fMRI data, to represent complex neural activities during movie\nclip viewing. Furthermore, improved generalization to different test\nenvironments is observed across all simulation data.\n","authors":["Mingzhe Yang","Zhipeng Wang","Kaiwei Liu","Yingqi Rong","Bing Yuan","Jiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.09952v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18194v1","updated":"2023-11-30T02:26:55Z","published":"2023-11-30T02:26:55Z","title":"Positional Information Matters for Invariant In-Context Learning: A Case\n Study of Simple Function Classes","summary":" In-context learning (ICL) refers to the ability of a model to condition on a\nfew in-context demonstrations (input-output examples of the underlying task) to\ngenerate the answer for a new query input, without updating parameters. Despite\nthe impressive ICL ability of LLMs, it has also been found that ICL in LLMs is\nsensitive to input demonstrations and limited to short context lengths. To\nunderstand the limitations and principles for successful ICL, we conduct an\ninvestigation with ICL linear regression of transformers. We characterize\nseveral Out-of-Distribution (OOD) cases for ICL inspired by realistic LLM ICL\nfailures and compare transformers with DeepSet, a simple yet powerful\narchitecture for ICL. Surprisingly, DeepSet outperforms transformers across a\nvariety of distribution shifts, implying that preserving permutation invariance\nsymmetry to input demonstrations is crucial for OOD ICL. 
The phenomenon\nspecifies a fundamental requirement by ICL, which we termed as ICL invariance.\nNevertheless, the positional encodings in LLMs will break ICL invariance. To\nthis end, we further evaluate transformers with identical positional encodings\nand find preserving ICL invariance in transformers achieves state-of-the-art\nperformance across various ICL distribution shifts\n","authors":["Yongqiang Chen","Binghui Xie","Kaiwen Zhou","Bo Han","Yatao Bian","James Cheng"],"pdf_url":"https://arxiv.org/pdf/2311.18194v1.pdf","comment":"Ongoing work; preliminary version"},{"id":"http://arxiv.org/abs/2311.18190v1","updated":"2023-11-30T02:19:35Z","published":"2023-11-30T02:19:35Z","title":"Toward the Tradeoffs between Privacy, Fairness and Utility in Federated\n Learning","summary":" Federated Learning (FL) is a novel privacy-protection distributed machine\nlearning paradigm that guarantees user privacy and prevents the risk of data\nleakage due to the advantage of the client's local training. Researchers have\nstruggled to design fair FL systems that ensure fairness of results. However,\nthe interplay between fairness and privacy has been less studied. Increasing\nthe fairness of FL systems can have an impact on user privacy, while an\nincrease in user privacy can affect fairness. In this work, on the client side,\nwe use fairness metrics, such as Demographic Parity (DemP), Equalized Odds\n(EOs), and Disparate Impact (DI), to construct the local fair model. To protect\nthe privacy of the client model, we propose a privacy-protection fairness FL\nmethod. The results show that the accuracy of the fair model with privacy\nincreases because privacy breaks the constraints of the fairness metrics. In\nour experiments, we conclude the relationship between privacy, fairness and\nutility, and there is a tradeoff between these.\n","authors":["Kangkang Sun","Xiaojin Zhang","Xi Lin","Gaolei Li","Jing Wang","Jianhua Li"],"pdf_url":"https://arxiv.org/pdf/2311.18190v1.pdf","comment":"17 pages, 3 figures, conference"},{"id":"http://arxiv.org/abs/2310.00533v3","updated":"2023-11-30T02:18:10Z","published":"2023-10-01T00:52:24Z","title":"SELF: Language-Driven Self-Evolution for Large Language Models","summary":" Large Language Models (LLMs) have demonstrated remarkable versatility across\nvarious domains. To further advance LLMs, we propose 'SELF' (Self-Evolution\nwith Language Feedback), a novel approach that enables LLMs to self-improve\nthrough self-reflection, akin to human learning processes. SELF initiates with\na meta-skill learning process that equips the LLMs with capabilities for\nself-feedback and self-refinement. Subsequently, the model undergoes an\niterative process of self-evolution. In each iteration, it utilizes an\nunlabeled dataset of instructions to generate initial responses. These\nresponses are enhanced through self-feedback and self-refinement. The model is\nthen fine-tuned using this enhanced data. The model undergoes progressive\nimprovement through this iterative self-evolution process. Moreover, the SELF\nframework enables the model to apply self-refinement during inference, which\nfurther improves response quality. Our experiments in mathematics and general\ntasks demonstrate that SELF can enhance the capabilities of LLMs without human\nintervention. 
The SELF framework indicates a promising direction for the\nautonomous evolution of LLMs, transitioning them from passive information\nreceivers to active participants in their development.\n","authors":["Jianqiao Lu","Wanjun Zhong","Wenyong Huang","Yufei Wang","Fei Mi","Baojun Wang","Weichao Wang","Lifeng Shang","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2310.00533v3.pdf","comment":"17 pages, 4 figures, 11 tables"},{"id":"http://arxiv.org/abs/2311.18188v1","updated":"2023-11-30T02:15:07Z","published":"2023-11-30T02:15:07Z","title":"Leveraging cache to enable SLU on tiny devices","summary":" This paper addresses spoken language understanding (SLU) on\nmicrocontroller-like embedded devices, integrating on-device execution with\ncloud offloading in a novel fashion. We exploit temporal locality in a device's\nspeech inputs and accordingly reuse recent SLU inferences. Our idea is simple:\nlet the device match new inputs against cached results, and only offload\nunmatched inputs to the cloud for full inference. Realization of this idea,\nhowever, is non-trivial: the device needs to compare acoustic features in a\nrobust, low-cost way. To this end, we present XYZ, a speech cache for tiny\ndevices. It matches speech inputs at two levels of representations: first by\nclustered sequences of raw sound units, then as sequences of phonemes. Working\nin tandem, the two representations offer complementary cost/accuracy tradeoffs.\nTo further boost accuracy, our cache is learning: with the mismatched and then\noffloaded inputs, it continuously finetunes the device's feature extractors\n(with the assistance of the cloud). We implement XYZ on an off-the-shelf STM32\nmicrocontroller. The resultant implementation has a small memory footprint of\n2MB. Evaluated on challenging speech benchmarks, our system resolves 45%--90%\nof inputs on device, reducing the average latency by up to 80% compared to\noffloading to popular cloud speech services. Our benefit is pronounced even in\nadversarial settings -- noisy environments, cold cache, or one device shared by\na number of users.\n","authors":["Afsara Benazir","Zhiming Xu","Felix Xiaozhu Lin"],"pdf_url":"https://arxiv.org/pdf/2311.18188v1.pdf","comment":"submitted to Mobisys 2024"},{"id":"http://arxiv.org/abs/2311.17400v2","updated":"2023-11-30T02:08:24Z","published":"2023-11-29T07:09:13Z","title":"Improving the Robustness of Transformer-based Large Language Models with\n Dynamic Attention","summary":" Transformer-based models, such as BERT and GPT, have been widely adopted in\nnatural language processing (NLP) due to their exceptional performance.\nHowever, recent studies show their vulnerability to textual adversarial attacks\nwhere the model's output can be misled by intentionally manipulating the text\ninputs. Despite various methods that have been proposed to enhance the model's\nrobustness and mitigate this vulnerability, many require heavy consumption\nresources (e.g., adversarial training) or only provide limited protection\n(e.g., defensive dropout). In this paper, we propose a novel method called\ndynamic attention, tailored for the transformer architecture, to enhance the\ninherent robustness of the model itself against various adversarial attacks.\nOur method requires no downstream task knowledge and does not incur additional\ncosts. 
The proposed dynamic attention consists of two modules: (I) attention\nrectification, which masks or weakens the attention value of the chosen tokens,\nand (ii) dynamic modeling, which dynamically builds the set of candidate\ntokens. Extensive experiments demonstrate that dynamic attention significantly\nmitigates the impact of adversarial attacks, improving up to 33\\% better\nperformance than previous methods against widely-used adversarial attacks. The\nmodel-level design of dynamic attention enables it to be easily combined with\nother defense methods (e.g., adversarial training) to further enhance the\nmodel's robustness. Furthermore, we demonstrate that dynamic attention\npreserves the state-of-the-art robustness space of the original model compared\nto other dynamic modeling methods.\n","authors":["Lujia Shen","Yuwen Pu","Shouling Ji","Changjiang Li","Xuhong Zhang","Chunpeng Ge","Ting Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17400v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17134v2","updated":"2023-11-30T02:06:29Z","published":"2023-11-28T18:51:19Z","title":"GlycoNMR: Dataset and benchmarks for NMR chemical shift prediction of\n carbohydrates with graph neural networks","summary":" Molecular representation learning (MRL) is a powerful tool for bridging the\ngap between machine learning and chemical sciences, as it converts molecules\ninto numerical representations while preserving their chemical features. These\nencoded representations serve as a foundation for various downstream\nbiochemical studies, including property prediction and drug design. MRL has had\ngreat success with proteins and general biomolecule datasets. Yet, in the\ngrowing sub-field of glycoscience (the study of carbohydrates, where longer\ncarbohydrates are also called glycans), MRL methods have been barely explored.\nThis under-exploration can be primarily attributed to the limited availability\nof comprehensive and well-curated carbohydrate-specific datasets and a lack of\nMachine learning (ML) pipelines specifically tailored to meet the unique\nproblems presented by carbohydrate data. Since interpreting and annotating\ncarbohydrate-specific data is generally more complicated than protein data,\ndomain experts are usually required to get involved. The existing MRL methods,\npredominately optimized for proteins and small biomolecules, also cannot be\ndirectly used in carbohydrate applications without special modifications. To\naddress this challenge, accelerate progress in glycoscience, and enrich the\ndata resources of the MRL community, we introduce GlycoNMR. GlycoNMR contains\ntwo laboriously curated datasets with 2,609 carbohydrate structures and 211,543\nannotated nuclear magnetic resonance (NMR) chemical shifts for precise\natomic-level prediction. We tailored carbohydrate-specific features and adapted\nexisting MRL models to tackle this problem effectively. For illustration, we\nbenchmark four modified MRL models on our new datasets.\n","authors":["Zizhang Chen","Ryan Paul Badman","Lachele Foley","Robert Woods","Pengyu Hong"],"pdf_url":"https://arxiv.org/pdf/2311.17134v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14948v2","updated":"2023-11-30T01:58:33Z","published":"2023-11-25T06:55:13Z","title":"Effective Backdoor Mitigation Depends on the Pre-training Objective","summary":" Despite the advanced capabilities of contemporary machine learning (ML)\nmodels, they remain vulnerable to adversarial and backdoor attacks. 
This\nvulnerability is particularly concerning in real-world deployments, where\ncompromised models may exhibit unpredictable behavior in critical scenarios.\nSuch risks are heightened by the prevalent practice of collecting massive,\ninternet-sourced datasets for pre-training multimodal models, as these datasets\nmay harbor backdoors. Various techniques have been proposed to mitigate the\neffects of backdooring in these models such as CleanCLIP which is the current\nstate-of-the-art approach. In this work, we demonstrate that the efficacy of\nCleanCLIP in mitigating backdoors is highly dependent on the particular\nobjective used during model pre-training. We observe that stronger pre-training\nobjectives correlate with harder to remove backdoors behaviors. We show this by\ntraining multimodal models on two large datasets consisting of 3 million (CC3M)\nand 6 million (CC6M) datapoints, under various pre-training objectives,\nfollowed by poison removal using CleanCLIP. We find that CleanCLIP is\nineffective when stronger pre-training objectives are used, even with extensive\nhyperparameter tuning. Our findings underscore critical considerations for ML\npractitioners who pre-train models using large-scale web-curated data and are\nconcerned about potential backdoor threats. Notably, our results suggest that\nsimpler pre-training objectives are more amenable to effective backdoor\nremoval. This insight is pivotal for practitioners seeking to balance the\ntrade-offs between using stronger pre-training objectives and security against\nbackdoor attacks.\n","authors":["Sahil Verma","Gantavya Bhatt","Avi Schwarzschild","Soumye Singhal","Arnav Mohanty Das","Chirag Shah","John P Dickerson","Jeff Bilmes"],"pdf_url":"https://arxiv.org/pdf/2311.14948v2.pdf","comment":"Accepted for oral presentation at BUGS workshop @ NeurIPS 2023\n (https://neurips2023-bugs.github.io/)"}],"Multimedia":[{"id":"http://arxiv.org/abs/2311.18837v1","updated":"2023-11-30T18:59:52Z","published":"2023-11-30T18:59:52Z","title":"VIDiff: Translating Videos via Multi-Modal Instructions with Diffusion\n Models","summary":" Diffusion models have achieved significant success in image and video\ngeneration. This motivates a growing interest in video editing tasks, where\nvideos are edited according to provided text descriptions. However, most\nexisting approaches only focus on video editing for short clips and rely on\ntime-consuming tuning or inference. We are the first to propose Video\nInstruction Diffusion (VIDiff), a unified foundation model designed for a wide\nrange of video tasks. These tasks encompass both understanding tasks (such as\nlanguage-guided video object segmentation) and generative tasks (video editing\nand enhancement). Our model can edit and translate the desired results within\nseconds based on user instructions. Moreover, we design an iterative\nauto-regressive method to ensure consistency in editing and enhancing long\nvideos. We provide convincing generative results for diverse input videos and\nwritten instructions, both qualitatively and quantitatively. 
More examples can\nbe found at our website https://ChenHsing.github.io/VIDiff.\n","authors":["Zhen Xing","Qi Dai","Zihao Zhang","Hui Zhang","Han Hu","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.18837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18827v1","updated":"2023-11-30T18:59:06Z","published":"2023-11-30T18:59:06Z","title":"Motion-Conditioned Image Animation for Video Editing","summary":" We introduce MoCA, a Motion-Conditioned Image Animation approach for video\nediting. It leverages a simple decomposition of the video editing problem into\nimage editing followed by motion-conditioned image animation. Furthermore,\ngiven the lack of robust evaluation datasets for video editing, we introduce a\nnew benchmark that measures edit capability across a wide variety of tasks,\nsuch as object replacement, background changes, style changes, and motion\nedits. We present a comprehensive human evaluation of the latest video editing\nmethods along with MoCA, on our proposed benchmark. MoCA establishes a new\nstate-of-the-art, demonstrating greater human preference win-rate, and\noutperforming notable recent approaches including Dreamix (63%), MasaCtrl\n(75%), and Tune-A-Video (72%), with especially significant improvements for\nmotion edits.\n","authors":["Wilson Yan","Andrew Brown","Pieter Abbeel","Rohit Girdhar","Samaneh Azadi"],"pdf_url":"https://arxiv.org/pdf/2311.18827v1.pdf","comment":"Project page: https://facebookresearch.github.io/MoCA"},{"id":"http://arxiv.org/abs/2311.18788v1","updated":"2023-11-30T18:37:21Z","published":"2023-11-30T18:37:21Z","title":"Automated interpretation of congenital heart disease from multi-view\n echocardiograms","summary":" Congenital heart disease (CHD) is the most common birth defect and the\nleading cause of neonate death in China. Clinical diagnosis can be based on the\nselected 2D key-frames from five views. Limited by the availability of\nmulti-view data, most methods have to rely on the insufficient single view\nanalysis. This study proposes to automatically analyze the multi-view\nechocardiograms with a practical end-to-end framework. We collect the five-view\nechocardiograms video records of 1308 subjects (including normal controls,\nventricular septal defect (VSD) patients and atrial septal defect (ASD)\npatients) with both disease labels and standard-view key-frame labels.\nDepthwise separable convolution-based multi-channel networks are adopted to\nlargely reduce the network parameters. We also approach the imbalanced class\nproblem by augmenting the positive training samples. Our 2D key-frame model can\ndiagnose CHD or negative samples with an accuracy of 95.4\\%, and in negative,\nVSD or ASD classification with an accuracy of 92.3\\%. To further alleviate the\nwork of key-frame selection in real-world implementation, we propose an\nadaptive soft attention scheme to directly explore the raw video data. Four\nkinds of neural aggregation methods are systematically investigated to fuse the\ninformation of an arbitrary number of frames in a video. Moreover, with a view\ndetection module, the system can work without the view records. Our video-based\nmodel can diagnose with an accuracy of 93.9\\% (binary classification), and\n92.1\\% (3-class classification) in a collected 2D video testing set, which does\nnot need key-frame selection and view annotation in testing. 
The detailed\nablation study and the interpretability analysis are provided.\n","authors":["Jing Wang","Xiaofeng Liu","Fangyun Wang","Lin Zheng","Fengqiao Gao","Hanwen Zhang","Xin Zhang","Wanqing Xie","Binbin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.18788v1.pdf","comment":"Published in Medical Image Analysis"},{"id":"http://arxiv.org/abs/2311.18664v1","updated":"2023-11-30T16:13:17Z","published":"2023-11-30T16:13:17Z","title":"Multi-task learning with cross-task consistency for improved depth\n estimation in colonoscopy","summary":" Colonoscopy screening is the gold standard procedure for assessing\nabnormalities in the colon and rectum, such as ulcers and cancerous polyps.\nMeasuring the abnormal mucosal area and its 3D reconstruction can help quantify\nthe surveyed area and objectively evaluate disease burden. However, due to the\ncomplex topology of these organs and variable physical conditions, for example,\nlighting, large homogeneous texture, and image modality estimating distance\nfrom the camera aka depth) is highly challenging. Moreover, most colonoscopic\nvideo acquisition is monocular, making the depth estimation a non-trivial\nproblem. While methods in computer vision for depth estimation have been\nproposed and advanced on natural scene datasets, the efficacy of these\ntechniques has not been widely quantified on colonoscopy datasets. As the\ncolonic mucosa has several low-texture regions that are not well pronounced,\nlearning representations from an auxiliary task can improve salient feature\nextraction, allowing estimation of accurate camera depths. In this work, we\npropose to develop a novel multi-task learning (MTL) approach with a shared\nencoder and two decoders, namely a surface normal decoder and a depth estimator\ndecoder. Our depth estimator incorporates attention mechanisms to enhance\nglobal context awareness. We leverage the surface normal prediction to improve\ngeometric feature extraction. Also, we apply a cross-task consistency loss\namong the two geometrically related tasks, surface normal and camera depth. We\ndemonstrate an improvement of 14.17% on relative error and 10.4% improvement on\n$\\delta_{1}$ accuracy over the most accurate baseline state-of-the-art BTS\napproach. All experiments are conducted on a recently released C3VD dataset;\nthus, we provide a first benchmark of state-of-the-art methods.\n","authors":["Pedro Esteban Chavarrias Solano","Andrew Bulpitt","Venkataraman Subramanian","Sharib Ali"],"pdf_url":"https://arxiv.org/pdf/2311.18664v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2309.06978v3","updated":"2023-11-30T15:23:21Z","published":"2023-09-13T14:13:08Z","title":"Differentiable JPEG: The Devil is in the Details","summary":" JPEG remains one of the most widespread lossy image coding methods. However,\nthe non-differentiable nature of JPEG restricts the application in deep\nlearning pipelines. Several differentiable approximations of JPEG have recently\nbeen proposed to address this issue. This paper conducts a comprehensive review\nof existing diff. JPEG approaches and identifies critical details that have\nbeen missed by previous methods. To this end, we propose a novel diff. JPEG\napproach, overcoming previous limitations. Our approach is differentiable\nw.r.t. the input image, the JPEG quality, the quantization tables, and the\ncolor conversion parameters. We evaluate the forward and backward performance\nof our diff. JPEG approach against existing methods. 
Additionally, extensive\nablations are performed to evaluate crucial design choices. Our proposed diff.\nJPEG resembles the (non-diff.) reference implementation best, significantly\nsurpassing the recent-best diff. approach by $3.47$dB (PSNR) on average. For\nstrong compression rates, we can even improve PSNR by $9.51$dB. Strong\nadversarial attack results are yielded by our diff. JPEG, demonstrating the\neffective gradient approximation. Our code is available at\nhttps://github.com/necla-ml/Diff-JPEG.\n","authors":["Christoph Reich","Biplob Debnath","Deep Patel","Srimat Chakradhar"],"pdf_url":"https://arxiv.org/pdf/2309.06978v3.pdf","comment":"Accepted at WACV 2024. Project page:\n https://christophreich1996.github.io/differentiable_jpeg/"},{"id":"http://arxiv.org/abs/2111.12727v3","updated":"2023-11-30T11:47:36Z","published":"2021-11-24T19:00:05Z","title":"Generating More Pertinent Captions by Leveraging Semantics and Style on\n Multi-Source Datasets","summary":" This paper addresses the task of generating fluent descriptions by training\non a non-uniform combination of data sources, containing both human-annotated\nand web-collected captions. Large-scale datasets with noisy image-text pairs,\nindeed, provide a sub-optimal source of supervision because of their\nlow-quality descriptive style, while human-annotated datasets are cleaner but\nsmaller in scale. To get the best of both worlds, we propose to leverage and\nseparate semantics and descriptive style through the incorporation of a style\ntoken and keywords extracted through a retrieval component. The proposed model\navoids the need of object detectors, is trained with a single objective of\nprompt language modeling, and can replicate the style of human-collected\ncaptions while training on sources with different input styles. Experimentally,\nthe model shows a strong capability of recognizing real-world concepts and\nproducing high-quality captions. Extensive experiments are performed on\ndifferent image captioning datasets, including CC3M, nocaps, and the\ncompetitive COCO dataset, where our model consistently outperforms baselines\nand state-of-the-art approaches.\n","authors":["Marcella Cornia","Lorenzo Baraldi","Giuseppe Fiameni","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2111.12727v3.pdf","comment":"Accepted to IJCV"},{"id":"http://arxiv.org/abs/2309.05950v3","updated":"2023-11-30T10:35:40Z","published":"2023-09-12T04:03:41Z","title":"Language Models as Black-Box Optimizers for Vision-Language Models","summary":" Vision-language models (VLMs) pre-trained on web-scale datasets have\ndemonstrated remarkable capabilities on downstream tasks when fine-tuned with\nminimal data. However, many VLMs rely on proprietary data and are not\nopen-source, which restricts the use of white-box approaches for fine-tuning.\nAs such, we aim to develop a black-box approach to optimize VLMs through\nnatural language prompts, thereby avoiding the need to access model parameters,\nfeature embeddings, or even output logits. We propose employing chat-based LLMs\nto search for the best text prompt for VLMs. Specifically, we adopt an\nautomatic hill-climbing procedure that converges to an effective prompt by\nevaluating the performance of current prompts and asking LLMs to refine them\nbased on textual feedback, all within a conversational process without\nhuman-in-the-loop. 
In a challenging 1-shot image classification setup, our\nsimple approach surpasses the white-box continuous prompting method (CoOp) by\nan average of 1.5% across 11 datasets including ImageNet. Our approach also\noutperforms both human-engineered and LLM-generated prompts. We highlight the\nadvantage of conversational feedback that incorporates both positive and\nnegative prompts, suggesting that LLMs can utilize the implicit gradient\ndirection in textual feedback for a more efficient search. In addition, we find\nthat the text prompts generated through our strategy are not only more\ninterpretable but also transfer well across different VLM architectures in a\nblack-box manner. Lastly, we demonstrate our framework on a state-of-the-art\nblack-box VLM (DALL-E 3) for text-to-image optimization.\n","authors":["Shihong Liu","Zhiqiu Lin","Samuel Yu","Ryan Lee","Tiffany Ling","Deepak Pathak","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2309.05950v3.pdf","comment":"Project site: llm-can-optimize-vlm.github.io"},{"id":"http://arxiv.org/abs/2311.18273v1","updated":"2023-11-30T06:23:15Z","published":"2023-11-30T06:23:15Z","title":"HKUST at SemEval-2023 Task 1: Visual Word Sense Disambiguation with\n Context Augmentation and Visual Assistance","summary":" Visual Word Sense Disambiguation (VWSD) is a multi-modal task that aims to\nselect, among a batch of candidate images, the one that best entails the target\nword's meaning within a limited context. In this paper, we propose a\nmulti-modal retrieval framework that maximally leverages pretrained\nVision-Language models, as well as open knowledge bases and datasets. Our\nsystem consists of the following key components: (1) Gloss matching: a\npretrained bi-encoder model is used to match contexts with proper senses of the\ntarget words; (2) Prompting: matched glosses and other textual information,\nsuch as synonyms, are incorporated using a prompting template; (3) Image\nretrieval: semantically matching images are retrieved from large open datasets\nusing prompts as queries; (4) Modality fusion: contextual information from\ndifferent modalities are fused and used for prediction. Although our system\ndoes not produce the most competitive results at SemEval-2023 Task 1, we are\nstill able to beat nearly half of the teams. More importantly, our experiments\nreveal acute insights for the field of Word Sense Disambiguation (WSD) and\nmulti-modal learning. Our code is available on GitHub.\n","authors":["Zhuohao Yin","Xin Huang"],"pdf_url":"https://arxiv.org/pdf/2311.18273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18248v1","updated":"2023-11-30T04:43:26Z","published":"2023-11-30T04:43:26Z","title":"mPLUG-PaperOwl: Scientific Diagram Analysis with the Multimodal Large\n Language Model","summary":" Recently, the strong text creation ability of Large Language Models(LLMs) has\ngiven rise to many tools for assisting paper reading or even writing. However,\nthe weak diagram analysis abilities of LLMs or Multimodal LLMs greatly limit\ntheir application scenarios, especially for scientific academic paper writing.\nIn this work, towards a more versatile copilot for academic paper writing, we\nmainly focus on strengthening the multi-modal diagram analysis ability of\nMultimodal LLMs. By parsing Latex source files of high-quality papers, we\ncarefully build a multi-modal diagram understanding dataset M-Paper. By\naligning diagrams in the paper with related paragraphs, we construct\nprofessional diagram analysis samples for training and evaluation. 
M-Paper is\nthe first dataset to support joint comprehension of multiple scientific\ndiagrams, including figures and tables in the format of images or Latex codes.\nBesides, to better align the copilot with the user's intention, we introduce\nthe `outline' as the control signal, which could be directly given by the user\nor revised based on auto-generated ones. Comprehensive experiments with a\nstate-of-the-art Mumtimodal LLM demonstrate that training on our dataset shows\nstronger scientific diagram understanding performance, including diagram\ncaptioning, diagram analysis, and outline recommendation. The dataset, code,\nand model are available at\nhttps://github.com/X-PLUG/mPLUG-DocOwl/tree/main/PaperOwl.\n","authors":["Anwen Hu","Yaya Shi","Haiyang Xu","Jiabo Ye","Qinghao Ye","Ming Yan","Chenliang Li","Qi Qian","Ji Zhang","Fei Huang"],"pdf_url":"https://arxiv.org/pdf/2311.18248v1.pdf","comment":"20 pages, 12 figures. arXiv admin note: text overlap with\n arXiv:2305.15225 by other authors"},{"id":"http://arxiv.org/abs/2311.18243v1","updated":"2023-11-30T04:21:10Z","published":"2023-11-30T04:21:10Z","title":"DKiS: Decay weight invertible image steganography with private key","summary":" Image steganography, the practice of concealing information within another\nimage, traditionally faces security challenges when its methods become publicly\nknown. To counteract this, we introduce a novel private key-based image\nsteganography technique. This approach ensures the security of hidden\ninformation, requiring a corresponding private key for access, irrespective of\nthe public knowledge of the steganography method. We present experimental\nevidence demonstrating our method's effectiveness, showcasing its real-world\napplicability. Additionally, we identified a critical challenge in the\ninvertible image steganography process: the transfer of non-essential, or\n`garbage', information from the secret to the host pipeline. To address this,\nwe introduced the decay weight to control the information transfer, filtering\nout irrelevant data and enhancing the performance of image steganography. Our\ncode is publicly accessible at https://github.com/yanghangAI/DKiS, and a\npractical demonstration is available at http://yanghang.site/hidekey.\n","authors":["Hang Yang","Yitian Xu","Xuhua Liu"],"pdf_url":"https://arxiv.org/pdf/2311.18243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18216v1","updated":"2023-11-30T03:20:42Z","published":"2023-11-30T03:20:42Z","title":"FS-BAND: A Frequency-Sensitive Banding Detector","summary":" Banding artifact, as known as staircase-like contour, is a common quality\nannoyance that happens in compression, transmission, etc. scenarios, which\nlargely affects the user's quality of experience (QoE). The banding distortion\ntypically appears as relatively small pixel-wise variations in smooth\nbackgrounds, which is difficult to analyze in the spatial domain but easily\nreflected in the frequency domain. In this paper, we thereby study the banding\nartifact from the frequency aspect and propose a no-reference banding detection\nmodel to capture and evaluate banding artifacts, called the Frequency-Sensitive\nBANding Detector (FS-BAND). The proposed detector is able to generate a\npixel-wise banding map with a perception correlated quality score. 
Experimental\nresults show that the proposed FS-BAND method outperforms state-of-the-art\nimage quality assessment (IQA) approaches with higher accuracy in banding\nclassification task.\n","authors":["Zijian Chen","Wei Sun","Zicheng Zhang","Ru Huang","Fangfang Lu","Xiongkuo Min","Guangtao Zhai","Wenjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.18216v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2311.17752"},{"id":"http://arxiv.org/abs/2311.18248v1","updated":"2023-11-30T04:43:26Z","published":"2023-11-30T04:43:26Z","title":"mPLUG-PaperOwl: Scientific Diagram Analysis with the Multimodal Large\n Language Model","summary":" Recently, the strong text creation ability of Large Language Models(LLMs) has\ngiven rise to many tools for assisting paper reading or even writing. However,\nthe weak diagram analysis abilities of LLMs or Multimodal LLMs greatly limit\ntheir application scenarios, especially for scientific academic paper writing.\nIn this work, towards a more versatile copilot for academic paper writing, we\nmainly focus on strengthening the multi-modal diagram analysis ability of\nMultimodal LLMs. By parsing Latex source files of high-quality papers, we\ncarefully build a multi-modal diagram understanding dataset M-Paper. By\naligning diagrams in the paper with related paragraphs, we construct\nprofessional diagram analysis samples for training and evaluation. M-Paper is\nthe first dataset to support joint comprehension of multiple scientific\ndiagrams, including figures and tables in the format of images or Latex codes.\nBesides, to better align the copilot with the user's intention, we introduce\nthe `outline' as the control signal, which could be directly given by the user\nor revised based on auto-generated ones. Comprehensive experiments with a\nstate-of-the-art Mumtimodal LLM demonstrate that training on our dataset shows\nstronger scientific diagram understanding performance, including diagram\ncaptioning, diagram analysis, and outline recommendation. 
The dataset, code,\nand model are available at\nhttps://github.com/X-PLUG/mPLUG-DocOwl/tree/main/PaperOwl.\n","authors":["Anwen Hu","Yaya Shi","Haiyang Xu","Jiabo Ye","Qinghao Ye","Ming Yan","Chenliang Li","Qi Qian","Ji Zhang","Fei Huang"],"pdf_url":"https://arxiv.org/pdf/2311.18248v1.pdf","comment":"20 pages, 12 figures"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 00000000..7f5166c7 Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 00000000..9ded9d94 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + --header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + + --highlight-title: var(--nord0B); + --highlight-author: var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + --article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + --footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + --color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + --color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + --color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + --color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + --header-title-preffix: var(--color-primary-h); + + --icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + --date: 
var(--color-primary-l); + + --summary: var(--color-info); + --summary-hover: var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + --article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + --article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + --accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + font-weight: 700; + margin: 0; + color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + +.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + +.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + +.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid var(--accordion-content-rail-color); + 
font-size: var(--font-size-m); +} + +.article-expander { + padding: 10px 4px; + border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 00000000..17e9a5fe --- /dev/null +++ b/index.html @@ -0,0 +1,63566 @@ + + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 60 + +
+
+
+ + ☆ What Do Llamas Really Think? Revealing Preference Biases in Language + Model Representations + + +
+ Do large language models (LLMs) exhibit sociodemographic biases, even when +they decline to respond? To bypass their refusal to "speak," we study this +research question by probing contextualized embeddings and exploring whether +this bias is encoded in their latent representations. We propose a logistic +Bradley-Terry probe which predicts word pair preferences of LLMs from the +words' hidden vectors. We first validate our probe on three pair preference +tasks and thirteen LLMs, where we outperform the word embedding association +test (WEAT), a standard approach in testing for implicit association, by a +relative 27% in error rate. We also find that word pair preferences are best +represented in the middle layers. Next, we transfer probes trained on harmless +tasks (e.g., pick the larger number) to controversial ones (compare +ethnicities) to examine biases in nationality, politics, religion, and gender. +We observe substantial bias for all target classes: for instance, the Mistral +model implicitly prefers Europe to Africa, Christianity to Judaism, and +left-wing to right-wing politics, despite declining to answer. This suggests +that instruction fine-tuning does not necessarily debias contextualized +embeddings. Our codebase is at https://github.com/castorini/biasprobe. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Unnatural Error Correction: GPT-4 Can Almost Perfectly Handle Unnatural + Scrambled Text EMNLP 2023 + + +
+ While Large Language Models (LLMs) have achieved remarkable performance in +many tasks, much about their inner workings remains unclear. In this study, we +present novel experimental insights into the resilience of LLMs, particularly +GPT-4, when subjected to extensive character-level permutations. To investigate +this, we first propose the Scrambled Bench, a suite designed to measure the +capacity of LLMs to handle scrambled input, in terms of both recovering +scrambled sentences and answering questions given scrambled context. The +experimental results indicate that most powerful LLMs demonstrate the +capability akin to typoglycemia, a phenomenon where humans can understand the +meaning of words even when the letters within those words are scrambled, as +long as the first and last letters remain in place. More surprisingly, we found +that only GPT-4 nearly flawlessly processes inputs with unnatural errors, even +under the extreme condition, a task that poses significant challenges for other +LLMs and often even for humans. Specifically, GPT-4 can almost perfectly +reconstruct the original sentences from scrambled ones, decreasing the edit +distance by 95%, even when all letters within each word are entirely scrambled. +It is counter-intuitive that LLMs can exhibit such resilience despite severe +disruption to input tokenization caused by scrambled text. + +
+
+ comment: EMNLP 2023 (with an additional analysis section in appendix) +
+
+
+
+
+ + ☆ BIOCLIP: A Vision Foundation Model for the Tree of Life + + +
+ Images of the natural world, collected by a variety of cameras, from drones +to individual phones, are increasingly abundant sources of biological +information. There is an explosion of computational methods and tools, +particularly computer vision, for extracting biologically relevant information +from images for science and conservation. Yet most of these are bespoke +approaches designed for a specific task and are not easily adaptable or +extendable to new questions, contexts, and datasets. A vision model for general +organismal biology questions on images is of timely need. To approach this, we +curate and release TreeOfLife-10M, the largest and most diverse ML-ready +dataset of biology images. We then develop BioCLIP, a foundation model for the +tree of life, leveraging the unique properties of biology captured by +TreeOfLife-10M, namely the abundance and variety of images of plants, animals, +and fungi, together with the availability of rich structured biological +knowledge. We rigorously benchmark our approach on diverse fine-grained biology +classification tasks, and find that BioCLIP consistently and substantially +outperforms existing baselines (by 17% to 20% absolute). Intrinsic evaluation +reveals that BioCLIP has learned a hierarchical representation conforming to +the tree of life, shedding light on its strong generalizability. Our code, +models and data will be made available at +https://github.com/Imageomics/bioclip. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ X-InstructBLIP: A Framework for aligning X-Modal instruction-aware + representations to LLMs and Emergent Cross-modal Reasoning + + +
+ Vision-language pre-training and instruction tuning have demonstrated +general-purpose capabilities in 2D visual reasoning tasks by aligning visual +encoders with state-of-the-art large language models (LLMs). In this paper, we +introduce a simple, yet effective, cross-modality framework built atop frozen +LLMs that allows the integration of various modalities without extensive +modality-specific customization. To facilitate instruction-modality +fine-tuning, we collect high-quality instruction tuning data in an automatic +and scalable manner, composed of 24K QA samples for audio and 250K QA samples +for 3D. Leveraging instruction-aware representations, our model performs +comparably with leading-edge counterparts without the need of extensive +modality-specific pre-training or customization. Furthermore, our approach +demonstrates cross-modal reasoning abilities across two or more input +modalities, despite each modality projection being trained individually. To +study the model's cross-modal abilities, we contribute a novel Discriminative +Cross-modal Reasoning (DisCRn) evaluation task, comprising 9K audio-video QA +samples and 28K image-3D QA samples that require the model to reason +discriminatively across disparate input modalities. + +
+
+
+
+
+ + ☆ Mavericks at BLP-2023 Task 1: Ensemble-based Approach Using Language + Models for Violence Inciting Text Detection EMNLP 2023 + + +
+ This paper presents our work for the Violence Inciting Text Detection shared +task in the First Workshop on Bangla Language Processing. Social media has +accelerated the propagation of hate and violence-inciting speech in society. It +is essential to develop efficient mechanisms to detect and curb the propagation +of such texts. The problem of detecting violence-inciting texts is further +exacerbated in low-resource settings due to sparse research and less data. The +data provided in the shared task consists of texts in the Bangla language, +where each example is classified into one of the three categories defined based +on the types of violence-inciting texts. We try and evaluate several BERT-based +models, and then use an ensemble of the models as our final submission. Our +submission is ranked 10th in the final leaderboard of the shared task with a +macro F1 score of 0.737. + +
+
+ comment: 6 pages, 1 figure, accepted at the BLP Workshop, EMNLP 2023 +
+
+
+
+
+ + ☆ CoDi-2: In-Context, Interleaved, and Interactive Any-to-Any Generation + + +
+ We present CoDi-2, a versatile and interactive Multimodal Large Language +Model (MLLM) that can follow complex multimodal interleaved instructions, +conduct in-context learning (ICL), reason, chat, edit, etc., in an any-to-any +input-output modality paradigm. By aligning modalities with language for both +encoding and generation, CoDi-2 empowers Large Language Models (LLMs) to not +only understand complex modality-interleaved instructions and in-context +examples, but also autoregressively generate grounded and coherent multimodal +outputs in the continuous feature space. To train CoDi-2, we build a +large-scale generation dataset encompassing in-context multimodal instructions +across text, vision, and audio. CoDi-2 demonstrates a wide range of zero-shot +capabilities for multimodal generation, such as in-context learning, reasoning, +and compositionality of any-to-any modality generation through multi-round +interactive conversation. CoDi-2 surpasses previous domain-specific models on +tasks such as subject-driven image generation, vision transformation, and audio +editing. CoDi-2 signifies a substantial breakthrough in developing a +comprehensive multimodal foundation model adept at interpreting in-context +language-vision-audio interleaved instructions and producing multimodal +outputs. + +
+
+ comment: Project Page: https://codi-2.github.io/ +
+
+
+
+
+ + ☆ MLLMs-Augmented Visual-Language Representation Learning + + +
+ Visual-language pre-training (VLP) has achieved remarkable success in +multi-modal tasks, largely attributed to the availability of large-scale +image-text datasets. In this work, we demonstrate that multi-modal large +language models (MLLMs) can enhance visual-language representation learning by +improving data quality. Our approach is simple, utilizing MLLMs to extend +multiple captions for each image. To prevent the bias introduced by MLLMs' +hallucinations and intrinsic caption styles, we propose a "text shearing" to +keep the lengths of extended captions identical to the originals. In image-text +retrieval, our method consistently obtains 5.6 ~ 35.0% and 16.8 ~ 46.1% +improvement on R@1 under the fine-tuning and zero-shot settings, respectively. +Notably, our zero-shot results are comparable to fine-tuning on target +datasets, which encourages more exploration on the versatile use of MLLMs. + +
+
+
+
+
+ + ☆ Can training neural language models on a curriculum with developmentally + plausible data improve alignment with human reading behavior? CoNLL 2023 + + +
+ The use of neural language models to model human behavior has met with mixed +success. While some work has found that the surprisal estimates from these +models can be used to predict a wide range of human neural and behavioral +responses, other work studying more complex syntactic phenomena has found that +these surprisal estimates generate incorrect behavioral predictions. This paper +explores the extent to which the misalignment between empirical and +model-predicted behavior can be minimized by training models on more +developmentally plausible data, such as in the BabyLM Challenge. We trained +teacher language models on the BabyLM "strict-small" dataset and used sentence +level surprisal estimates from these teacher models to create a curriculum. We +found tentative evidence that our curriculum made it easier for models to +acquire linguistic knowledge from the training data: on the subset of tasks in +the BabyLM challenge suite evaluating models' grammatical knowledge of English, +models first trained on the BabyLM data curriculum and then on a few randomly +ordered training epochs performed slightly better than models trained on +randomly ordered epochs alone. This improved linguistic knowledge acquisition +did not result in better alignment with human reading behavior, however: models +trained on the BabyLM dataset (with or without a curriculum) generated +predictions that were as misaligned with human behavior as models trained on +larger less curated datasets. This suggests that training on developmentally +plausible datasets alone is likely insufficient to generate language models +capable of accurately predicting human language processing. + +
+
+ comment: To appear in the proceedings of BabyLM shared task CoNLL 2023 +
+
+
+
+
+ + ☆ TaskBench: Benchmarking Large Language Models for Task Automation + + +
+ Recently, the incredible progress of large language models (LLMs) has ignited +the spark of task automation, which decomposes the complex tasks described by +user instructions into sub-tasks, invokes external tools to execute them, +and plays a central role in autonomous agents. However, there is no +systematic and standardized benchmark to foster the development of LLMs in task +automation. To this end, we introduce TaskBench to evaluate the capability of +LLMs in task automation. Specifically, task automation can be formulated into +three critical stages: task decomposition, tool invocation, and parameter +prediction to fulfill user intent. This complexity makes data collection and +evaluation more challenging compared to common NLP tasks. To generate +high-quality evaluation datasets, we introduce the concept of Tool Graph to +represent the decomposed tasks in user intent, and adopt a back-instruct method +to simulate user instruction and annotations. Furthermore, we propose TaskEval +to evaluate the capability of LLMs from different aspects, including task +decomposition, tool invocation, and parameter prediction. Experimental results +demonstrate that TaskBench can effectively reflect the capability of LLMs in +task automation. Benefiting from the mixture of automated data construction and +human verification, TaskBench achieves a high consistency compared to the human +evaluation, which can be utilized as a comprehensive and faithful benchmark for +LLM-based autonomous agents. + +
+
+
+
+
+ + ☆ Language Model Agents Suffer from Compositional Generalization in Web + Automation + + +
+ Language model agents (LMAs) recently emerged as a promising paradigm for multi-step decision-making tasks, often outperforming humans and other reinforcement learning agents. Despite the promise, their performance on real-world applications that often involve combinations of tasks is still underexplored. In this work, we introduce a new benchmark, called CompWoB -- 50 new compositional web automation tasks reflecting more realistic assumptions. We show that while existing prompted LMAs (gpt-3.5-turbo or gpt-4) achieve a 94.0% average success rate on base tasks, their performance degrades to a 24.9% success rate on compositional tasks. On the other hand, transferred LMAs (finetuned only on base tasks) show a smaller generalization gap, dropping from 85.4% to 54.8%. By balancing the data distribution across tasks, we train a new model, HTML-T5++, that surpasses human-level performance (95.2%) on MiniWoB and achieves the best zero-shot performance on CompWoB (61.5%). While these results highlight the promise of small-scale finetuned and transferred models for compositional generalization, their performance degrades further under instruction compositions that change the combination order. In contrast to the recent remarkable success of LMAs, our benchmark and detailed analysis emphasize the necessity of building LMAs that are robust and generalizable to task compositionality for real-world deployment.
+
+ comment: Code: + https://github.com/google-research/google-research/tree/master/compositional_rl/compwob +
+
+
+
+
+ + ☆ AlignBench: Benchmarking Chinese Alignment of Large Language Models + + +
+ Alignment has become a critical step for instruction-tuned Large Language Models (LLMs) to become helpful assistants. However, effective evaluation of alignment for emerging Chinese LLMs is still significantly lacking, calling for real-scenario-grounded, open-ended, challenging and automatic evaluations tailored for alignment. To fill this gap, we introduce AlignBench, a comprehensive multi-dimensional benchmark for evaluating LLMs' alignment in Chinese. Equipped with a human-in-the-loop data curation pipeline, our benchmark employs a rule-calibrated multi-dimensional LLM-as-Judge with Chain-of-Thought to generate explanations and final ratings as evaluations, ensuring high reliability and interpretability. Furthermore, we develop a dedicated companion evaluator LLM -- CritiqueLLM, which recovers 95\% of GPT-4's evaluation ability and will be provided via public APIs to researchers for the evaluation of alignment in Chinese LLMs. All evaluation code, data, and LLM generations are available at \url{https://github.com/THUDM/AlignBench}.
+
+
+
+
+ + ☆ Mavericks at NADI 2023 Shared Task: Unravelling Regional Nuances through + Dialect Identification using Transformer-based Approach EMNLP + 2023 + + +
+ In this paper, we present our approach for the "Nuanced Arabic Dialect Identification (NADI) Shared Task 2023". We highlight our methodology for subtask 1, which deals with country-level dialect identification. Recognizing dialects plays an instrumental role in enhancing the performance of various downstream NLP tasks such as speech recognition and translation. The task uses the Twitter dataset (TWT-2023), which encompasses 18 dialects, for the multi-class classification problem. Numerous transformer-based models, pre-trained on the Arabic language, are employed for identifying country-level dialects. We fine-tune these state-of-the-art models on the provided dataset. Ensembling is leveraged to improve the performance of the system. We achieved an F1-score of 76.65 (11th rank on the leaderboard) on the test dataset.
+
+ comment: 5 pages, 1 figure, accepted at the NADI ArabicNLP Workshop, EMNLP + 2023 +
+
+
+
+
+ + ☆ Mavericks at ArAIEval Shared Task: Towards a Safer Digital Space -- + Transformer Ensemble Models Tackling Deception and Persuasion EMNLP + + +
+ In this paper, we highlight our approach for the "Arabic AI Tasks Evaluation +(ArAiEval) Shared Task 2023". We present our approaches for task 1-A and task +2-A of the shared task which focus on persuasion technique detection and +disinformation detection respectively. Detection of persuasion techniques and +disinformation has become imperative to avoid distortion of authentic +information. The tasks use multigenre snippets of tweets and news articles for +the given binary classification problem. We experiment with several +transformer-based models that were pre-trained on the Arabic language. We +fine-tune these state-of-the-art models on the provided dataset. Ensembling is +employed to enhance the performance of the systems. We achieved a micro +F1-score of 0.742 on task 1-A (8th rank on the leaderboard) and 0.901 on task +2-A (7th rank on the leaderboard) respectively. + +
+
+ comment: 6 pages, 1 figure, accepted at the ArAIEval ArabicNLP workshop, EMNLP + conference 2023 +
+
+
+
+
+ + ☆ Automatic Functional Differentiation in JAX + + +
+ We extend JAX with the capability to automatically differentiate higher-order functions (functionals and operators). By representing functions as a generalization of arrays, we seamlessly use JAX's existing primitive system to implement higher-order functions. We present a set of primitive operators that serve as foundational building blocks for constructing several key types of functionals. For every introduced primitive operator, we derive and implement both linearization and transposition rules, aligning with JAX's internal protocols for forward and reverse mode automatic differentiation. This enhancement allows for functional differentiation in the same syntax traditionally used for functions. The resulting functional gradients are themselves functions ready to be invoked in Python. We showcase this tool's efficacy and simplicity through applications where functional derivatives are indispensable. The source code of this work is released at https://github.com/sail-sg/autofd .
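+
+ For context, plain JAX already differentiates ordinary functions with `jax.grad`; the work above extends this machinery to functionals and operators. A small baseline example using only standard JAX, not the released autofd package:
+
+ import jax
+ import jax.numpy as jnp
+
+ def f(x):
+     return jnp.sin(x) ** 2      # an ordinary scalar function
+
+ df = jax.grad(f)                # the derivative is itself a callable
+ print(df(1.0))                  # 2*sin(1)*cos(1) = sin(2) ≈ 0.9093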
+
+
+
+
+ + ☆ CoRec: An Easy Approach for Coordination Recognition EMNLP 2023 + + +
+ In this paper, we observe and address the challenges of the coordination +recognition task. Most existing methods rely on syntactic parsers to identify +the coordinators in a sentence and detect the coordination boundaries. However, +state-of-the-art syntactic parsers are slow and suffer from errors, especially +for long and complicated sentences. To better solve the problems, we propose a +pipeline model COordination RECognizer (CoRec). It consists of two components: +coordinator identifier and conjunct boundary detector. The experimental results +on datasets from various domains demonstrate the effectiveness and efficiency +of the proposed method. Further experiments show that CoRec positively impacts +downstream tasks, improving the yield of state-of-the-art Open IE models. + +
+
+ comment: Accepted by EMNLP 2023 Main Conference (oral presentation) +
+
+
+
+
+ + ☆ Women Are Beautiful, Men Are Leaders: Gender Stereotypes in Machine + Translation and Language Modeling + + +
+ We present GEST -- a new dataset for measuring gender-stereotypical reasoning +in masked LMs and English-to-X machine translation systems. GEST contains +samples that are compatible with 9 Slavic languages and English for 16 gender +stereotypes about men and women (e.g., Women are beautiful, Men are leaders). +The definition of said stereotypes was informed by gender experts. We used GEST +to evaluate 11 masked LMs and 4 machine translation systems. We discovered +significant and consistent amounts of stereotypical reasoning in almost all the +evaluated models and languages. + +
+
+
+
+
+ + ☆ CritiqueLLM: Scaling LLM-as-Critic for Effective and Explainable + Evaluation of Large Language Model Generation + + +
+ Since the natural language processing (NLP) community started to make large language models (LLMs), such as GPT-4, act as critics to evaluate the quality of generated texts, most existing works only train a critique generation model of a specific scale on specific datasets. We argue that a comprehensive investigation into the key factors of LLM-based evaluation models, such as scaling properties, is lacking, so it remains inconclusive whether these models have the potential to replace GPT-4's evaluation in practical scenarios. In this paper, we propose a new critique generation model called CritiqueLLM, which includes a dialogue-based prompting method for high-quality referenced / reference-free evaluation data. Experimental results show that our model can achieve evaluation performance comparable to GPT-4, especially in system-level correlations, and even outperforms GPT-4 in 3 out of 8 tasks in a challenging reference-free setting. We conduct detailed analysis to show the promising scaling properties of our model in the quality of generated critiques. We also demonstrate that our generated critiques can act as scalable feedback to directly improve the generation quality of LLMs.
+
+ comment: 18 pages, 5 figures +
+
+
+
+
+ + ☆ RaDialog: A Large Vision-Language Model for Radiology Report Generation + and Conversational Assistance + + +
+ Conversational AI tools that can generate and discuss clinically correct +radiology reports for a given medical image have the potential to transform +radiology. Such a human-in-the-loop radiology assistant could facilitate a +collaborative diagnostic process, thus saving time and improving the quality of +reports. Towards this goal, we introduce RaDialog, the first thoroughly +evaluated and publicly available large vision-language model for radiology +report generation and interactive dialog. RaDialog effectively integrates +visual image features and structured pathology findings with a large language +model (LLM) while simultaneously adapting it to a specialized domain using +parameter-efficient fine-tuning. To keep the conversational abilities of the +underlying LLM, we propose a comprehensive, semi-automatically labeled, +image-grounded instruct dataset for chest X-ray radiology tasks. By training +with this dataset, our method achieves state-of-the-art clinical correctness in +report generation and shows impressive abilities in interactive tasks such as +correcting reports and answering questions, serving as a foundational step +toward clinical dialog systems. Our code is available on github: +https://github.com/ChantalMP/RaDialog. + +
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ☆ ArcMMLU: A Library and Information Science Benchmark for Large Language + Models + + +
+ In light of the rapidly evolving capabilities of large language models +(LLMs), it becomes imperative to develop rigorous domain-specific evaluation +benchmarks to accurately assess their capabilities. In response to this need, +this paper introduces ArcMMLU, a specialized benchmark tailored for the Library +& Information Science (LIS) domain in Chinese. This benchmark aims to measure +the knowledge and reasoning capability of LLMs within four key sub-domains: +Archival Science, Data Science, Library Science, and Information Science. +Following the format of MMLU/CMMLU, we collected over 6,000 high-quality +questions for the compilation of ArcMMLU. This extensive compilation can +reflect the diverse nature of the LIS domain and offer a robust foundation for +LLM evaluation. Our comprehensive evaluation reveals that while most mainstream +LLMs achieve an average accuracy rate above 50% on ArcMMLU, there remains a +notable performance gap, suggesting substantial headroom for refinement in LLM +capabilities within the LIS domain. Further analysis explores the effectiveness +of few-shot examples on model performance and highlights challenging questions +where models consistently underperform, providing valuable insights for +targeted improvements. ArcMMLU fills a critical gap in LLM evaluations within +the Chinese LIS domain and paves the way for future development of LLMs +tailored to this specialized area. + +
+
+
+
+
+ + ☆ ArthModel: Enhance Arithmetic Skills to Large Language Model + + +
+ With the great success of ChatGPT, research on large language models has become increasingly popular. However, these models have several limitations, such as toxicity and poor performance on arithmetic problems. Meanwhile, LLMs may have potential abilities that have yet to be exploited. In this paper, we choose a different way to enhance the arithmetic ability of LLMs. We propose to train the LLM to generate a postfix expression related to the arithmetic problem and to incorporate it with a small pretrained model. This small model transfers the token embeddings into dense real numbers and invokes native functions of a deep learning platform to get the correct answer. To generate the final result, we propose prompt injection for adding the results output by the small model back into the LLM. This work provides different ways of thinking about, training, and using a language model. The code and models will be released at \url{https://github.com/eteced/arithmetic_finetuning_v1}.
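+
+ A minimal sketch of the kind of downstream computation this setup delegates to native numeric code: exactly evaluating a postfix (reverse Polish) expression. This is an illustrative evaluator, not the released arithmetic_finetuning_v1 code:
+
+ import operator
+
+ OPS = {"+": operator.add, "-": operator.sub, "*": operator.mul, "/": operator.truediv}
+
+ def eval_postfix(tokens):
+     """Evaluate a postfix expression such as ['3', '4', '2', '*', '+']."""
+     stack = []
+     for tok in tokens:
+         if tok in OPS:
+             b, a = stack.pop(), stack.pop()
+             stack.append(OPS[tok](a, b))
+         else:
+             stack.append(float(tok))
+     return stack[0]
+
+ print(eval_postfix(["3", "4", "2", "*", "+"]))   # 3 + 4 * 2 = 11.0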
+
+ comment: 7 pages, 4 figures, 1 table +
+
+
+
+
+ + ☆ FFT: Towards Harmlessness Evaluation and Analysis for LLMs with + Factuality, Fairness, Toxicity + + +
+ The widespread use of generative artificial intelligence has heightened concerns about the potential harms posed by AI-generated texts, primarily stemming from factually incorrect, unfair, and toxic content. Previous researchers have invested much effort in assessing the harmlessness of generative language models. However, existing benchmarks struggle in the era of large language models (LLMs), due to their stronger language generation and instruction-following capabilities, as well as their wider range of applications. In this paper, we propose FFT, a new benchmark with 2,116 elaborately designed instances, for LLM harmlessness evaluation with respect to factuality, fairness, and toxicity. To investigate the potential harms of LLMs, we evaluate 9 representative LLMs covering various parameter scales, training stages, and creators. Experiments show that the harmlessness of LLMs is still unsatisfactory, and extensive analysis derives some insightful findings that could inspire future research on harmless LLMs.
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Grammatical Gender's Influence on Distributional Semantics: A Causal + Perspective + + +
+ How much meaning influences gender assignment across languages is an active +area of research in modern linguistics and cognitive science. We can view +current approaches as aiming to determine where gender assignment falls on a +spectrum, from being fully arbitrarily determined to being largely semantically +determined. For the latter case, there is a formulation of the neo-Whorfian +hypothesis, which claims that even inanimate noun gender influences how people +conceive of and talk about objects (using the choice of adjective used to +modify inanimate nouns as a proxy for meaning). We offer a novel, causal +graphical model that jointly represents the interactions between a noun's +grammatical gender, its meaning, and adjective choice. In accordance with past +results, we find a relationship between the gender of nouns and the adjectives +which modify them. However, when we control for the meaning of the noun, we +find that grammatical gender has a near-zero effect on adjective choice, +thereby calling the neo-Whorfian hypothesis into question. + +
+
+
+
+
+ + ☆ ESG Accountability Made Easy: DocQA at Your Service AAAI + + +
+ We present Deep Search DocQA. This application enables information extraction +from documents via a question-answering conversational assistant. The system +integrates several technologies from different AI disciplines consisting of +document conversion to machine-readable format (via computer vision), finding +relevant data (via natural language processing), and formulating an eloquent +response (via large language models). Users can explore over 10,000 +Environmental, Social, and Governance (ESG) disclosure reports from over 2000 +corporations. The Deep Search platform can be accessed at: +https://ds4sd.github.io. + +
+
+ comment: Accepted at the Demonstration Track of the 38th Annual AAAI + Conference on Artificial Intelligence (AAAI 24) +
+
+
+
+
+ + ☆ Use of explicit replies as coordination mechanisms in online student + debate + + +
+ People in conversation entrain their linguistic behaviours through +spontaneous alignment mechanisms [7] - both in face-to-face and +computer-mediated communication (CMC) [8]. In CMC, one of the mechanisms +through which linguistic entrainment happens is through explicit replies. +Indeed, the use of explicit replies influences the structure of conversations, +favouring the formation of reply-trees typically delineated by topic shifts +[5]. The interpersonal coordination mechanisms realized by how actors address +each other have been studied using a probabilistic framework proposed by David +Gibson [2,3]. Other recent approaches use computational methods and information +theory to quantify changes in text. We explore coordination mechanisms +concerned with some of the roles utterances play in dialogues - specifically in +explicit replies. We identify these roles by finding community structure in the +conversation's vocabulary using a non-parametric, hierarchical topic model. +Some conversations may always stay on the ground, remaining at the level of +general introductory chatter. Some others may develop a specific sub-topic in +significant depth and detail. Even others may jump between general chatter, +out-of-topic remarks and people agreeing or disagreeing without further +elaboration. + +
+
+
+
+
+ + ☆ IAG: Induction-Augmented Generation Framework for Answering Reasoning + Questions + + +
+ Retrieval-Augmented Generation (RAG), by incorporating external knowledge with the parametric memory of language models, has become the state-of-the-art architecture for open-domain QA tasks. However, common knowledge bases are inherently constrained by limited coverage and noisy information, making retrieval-based approaches inadequate for answering implicit reasoning questions. In this paper, we propose an Induction-Augmented Generation (IAG) framework that utilizes inductive knowledge along with the retrieved documents for implicit reasoning. We leverage large language models (LLMs) to derive such knowledge via a novel prompting method based on inductive reasoning patterns. On top of this, we implement two versions of IAG, named IAG-GPT and IAG-Student. IAG-GPT directly utilizes the knowledge generated by GPT-3 for answer prediction, while IAG-Student removes the dependency on the GPT service at inference time by incorporating a student inductor model. The inductor is first trained via knowledge distillation and further optimized by back-propagating the generator feedback via differentiable beam scores. Experimental results show that IAG outperforms RAG baselines as well as ChatGPT on two open-domain QA tasks. Notably, our best models have won first place on the official leaderboards of CSQA2.0 (since Nov 1, 2022) and StrategyQA (since Jan 8, 2023).
+
+
+
+
+ + ☆ Hubness Reduction Improves Sentence-BERT Semantic Spaces + + +
+ Semantic representations of text, i.e. representations of natural language which capture meaning by geometry, are essential for areas such as information retrieval and document grouping. High-dimensional trained dense vectors have received much attention in recent years as such representations. We investigate the structure of semantic spaces that arise from embeddings made with Sentence-BERT and find that the representations suffer from a well-known problem in high dimensions called hubness. Hubness results in asymmetric neighborhood relations, such that some texts (the hubs) are neighbours of many other texts while most texts (so-called anti-hubs) are neighbours of few or no other texts. We quantify the semantic quality of the embeddings using hubness scores and the error rate of a neighbourhood-based classifier. We find that when hubness is high, we can reduce the error rate and hubness using hubness reduction methods. We identify a combination of two methods as resulting in the best reduction. For example, on one of the tested pretrained models, this combined method can reduce hubness by about 75% and the error rate by about 9%. Thus, we argue that mitigating hubness in the embedding space provides better semantic representations of text.
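+
+ A minimal sketch of one standard way to quantify hubness, via the skewness of the k-occurrence distribution (how often each embedding appears in other points' k-nearest-neighbour lists); this follows the common definition and is not necessarily the paper's exact pipeline:
+
+ import numpy as np
+ from scipy.stats import skew
+ from sklearn.neighbors import NearestNeighbors
+
+ def k_occurrence_skewness(embeddings: np.ndarray, k: int = 10) -> float:
+     """Higher positive skew of the k-occurrence counts indicates stronger hubness."""
+     nn = NearestNeighbors(n_neighbors=k + 1).fit(embeddings)   # +1: each point is its own nearest neighbour
+     _, idx = nn.kneighbors(embeddings)
+     counts = np.bincount(idx[:, 1:].ravel(), minlength=len(embeddings))
+     return float(skew(counts))
+
+ rng = np.random.default_rng(0)
+ print(k_occurrence_skewness(rng.normal(size=(500, 384))))     # high-dimensional data tends to give positive skew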
+
+ comment: Accepted at NLDL 2024 +
+
+
+
+
+ + ☆ Evaluating the Rationale Understanding of Critical Reasoning in Logical + Reading Comprehension EMNLP 2023 + + +
+ To precisely evaluate a language model's capability for logical reading comprehension, we present a dataset for testing the understanding of the rationale behind critical reasoning. For questions taken from an existing multiple-choice logical reading comprehension dataset, we crowdsource rationale texts that explain why we should select or eliminate answer options, resulting in 3,003 multiple-choice subquestions that are associated with 943 main questions. Experiments on our dataset show that recent large language models (e.g., InstructGPT) struggle to answer the subquestions even if they are able to answer the main questions correctly. We find that the models perform particularly poorly in answering subquestions written for the incorrect options of the main questions, implying that the models have a limited capability for explaining why incorrect alternatives should be eliminated. These results suggest that our dataset encourages further investigation into the critical reasoning ability of language models while focusing on the elimination process of relevant alternatives.
+
+ comment: Accepted to EMNLP 2023 +
+
+
+
+
+ + ☆ Consensus, dissensus and synergy between clinicians and specialist + foundation models in radiology report generation + + +
+ Radiology reports are an instrumental part of modern medicine, informing key clinical decisions such as diagnosis and treatment. The worldwide shortage of radiologists, however, restricts access to expert care and imposes heavy workloads, contributing to avoidable errors and delays in report delivery. While recent progress in automated report generation with vision-language models offers clear potential in ameliorating the situation, the path to real-world adoption has been stymied by the challenge of evaluating the clinical quality of AI-generated reports. In this study, we build a state-of-the-art report generation system for chest radiographs, Flamingo-CXR, by fine-tuning a well-known vision-language foundation model on radiology data. To evaluate the quality of the AI-generated reports, a group of 16 certified radiologists provide detailed evaluations of AI-generated and human-written reports for chest X-rays from an intensive care setting in the United States and an inpatient setting in India. At least one radiologist (out of two per case) preferred the AI report to the ground truth report in over 60$\%$ of cases for both datasets. Amongst the subset of AI-generated reports that contain errors, the most frequently cited reasons were related to the location and finding, whereas for human-written reports, most mistakes were related to severity and finding. This disparity suggests potential complementarity between our AI system and human experts, prompting us to develop an assistive scenario in which Flamingo-CXR generates a first-draft report, which is subsequently revised by a clinician. This is the first demonstration of clinician-AI collaboration for report writing, and the resultant reports are assessed to be equivalent to or preferred over reports written by experts alone by at least one radiologist in 80$\%$ of in-patient cases and 66$\%$ of intensive care cases.
+
+
+
+
+ + ☆ mPLUG-PaperOwl: Scientific Diagram Analysis with the Multimodal Large + Language Model + + +
+ Recently, the strong text creation ability of Large Language Models (LLMs) has given rise to many tools for assisting paper reading or even writing. However, the weak diagram analysis abilities of LLMs or Multimodal LLMs greatly limit their application scenarios, especially for scientific academic paper writing. In this work, towards a more versatile copilot for academic paper writing, we mainly focus on strengthening the multi-modal diagram analysis ability of Multimodal LLMs. By parsing LaTeX source files of high-quality papers, we carefully build a multi-modal diagram understanding dataset M-Paper. By aligning diagrams in the paper with related paragraphs, we construct professional diagram analysis samples for training and evaluation. M-Paper is the first dataset to support joint comprehension of multiple scientific diagrams, including figures and tables in the format of images or LaTeX code. Besides, to better align the copilot with the user's intention, we introduce the `outline' as the control signal, which can be directly given by the user or revised based on auto-generated ones. Comprehensive experiments with a state-of-the-art Multimodal LLM demonstrate that training on our dataset yields stronger scientific diagram understanding performance, including diagram captioning, diagram analysis, and outline recommendation. The dataset, code, and model are available at https://github.com/X-PLUG/mPLUG-DocOwl/tree/main/PaperOwl.
+
+ comment: 20 pages, 12 figures. arXiv admin note: text overlap with + arXiv:2305.15225 by other authors +
+
+
+
+
+ + ☆ LMRL Gym: Benchmarks for Multi-Turn Reinforcement Learning with Language + Models + + +
+ Large language models (LLMs) provide excellent text-generation capabilities, +but standard prompting and generation methods generally do not lead to +intentional or goal-directed agents and might necessitate considerable prompt +tuning. This becomes particularly apparent in multi-turn conversations: even +the best current LLMs rarely ask clarifying questions, engage in explicit +information gathering, or take actions now that lead to better decisions after +multiple turns. Reinforcement learning has the potential to leverage the +powerful modeling capabilities of LLMs, as well as their internal +representation of textual interactions, to create capable goal-directed +language agents. This can enable intentional and temporally extended +interactions, such as with humans, through coordinated persuasion and carefully +crafted questions, or in goal-directed play through text games to bring about +desired final outcomes. However, enabling this requires the community to +develop stable and reliable reinforcement learning algorithms that can +effectively train LLMs. Developing such algorithms requires tasks that can +gauge progress on algorithm design, provide accessible and reproducible +evaluations for multi-turn interactions, and cover a range of task properties +and challenges in improving reinforcement learning algorithms. Our paper +introduces the LMRL-Gym benchmark for evaluating multi-turn RL for LLMs, +together with an open-source research framework containing a basic toolkit for +getting started on multi-turn RL with offline value-based and policy-based RL +methods. Our benchmark consists of 8 different language tasks, which require +multiple rounds of language interaction and cover a range of tasks in +open-ended dialogue and text games. + +
+
+
+
+
+ + ☆ Automatic Construction of a Korean Toxic Instruction Dataset for Ethical + Tuning of Large Language Models NeurIPS 2023 + + +
+ Caution: this paper may include material that could be offensive or +distressing. + The advent of Large Language Models (LLMs) necessitates the development of +training approaches that mitigate the generation of unethical language and +aptly manage toxic user queries. Given the challenges related to human labor +and the scarcity of data, we present KoTox, comprising 39K unethical +instruction-output pairs. This collection of automatically generated toxic +instructions refines the training of LLMs and establishes a foundational +framework for improving LLMs' ethical awareness and response to various toxic +inputs, promoting more secure and responsible interactions in Natural Language +Processing (NLP) applications. + +
+
+ comment: NeurIPS 2023 Workshop on Instruction Tuning and Instruction Following +
+
+
+
+
+ + ☆ INarIG: Iterative Non-autoregressive Instruct Generation Model For + Word-Level Auto Completion EMNLP2023 + + +
+ Computer-aided translation (CAT) aims to enhance human translation efficiency and is still important in scenarios where machine translation cannot meet quality requirements. One fundamental task within this field is Word-Level Auto Completion (WLAC). WLAC predicts a target word given a source sentence, translation context, and a human-typed character sequence. Previous works either employ word classification models to exploit contextual information from both sides of the target word or directly disregard the dependencies from the right-side context. Furthermore, the key information, i.e. the human-typed sequence, is only used as a prefix constraint in the decoding module. In this paper, we propose the INarIG (Iterative Non-autoregressive Instruct Generation) model, which constructs the human-typed sequence into an Instruction Unit and employs iterative decoding with subwords to fully utilize the input information given in the task. Our model is more competent in dealing with low-frequency words (the core scenario of this task), and achieves state-of-the-art results on the WMT22 and benchmark datasets, with a maximum increase of over 10% in prediction accuracy.
+
+ comment: EMNLP2023 +
+
+
+
+
+ + ☆ COVID-19 Vaccine Misinformation in Middle Income Countries EMNLP 2023 + + +
+ This paper introduces a multilingual dataset of COVID-19 vaccine +misinformation, consisting of annotated tweets from three middle-income +countries: Brazil, Indonesia, and Nigeria. The expertly curated dataset +includes annotations for 5,952 tweets, assessing their relevance to COVID-19 +vaccines, presence of misinformation, and the themes of the misinformation. To +address challenges posed by domain specificity, the low-resource setting, and +data imbalance, we adopt two approaches for developing COVID-19 vaccine +misinformation detection models: domain-specific pre-training and text +augmentation using a large language model. Our best misinformation detection +models demonstrate improvements ranging from 2.7 to 15.9 percentage points in +macro F1-score compared to the baseline models. Additionally, we apply our +misinformation detection models in a large-scale study of 19 million unlabeled +tweets from the three countries between 2020 and 2022, showcasing the practical +application of our dataset and models for detecting and analyzing vaccine +misinformation in multiple countries and languages. Our analysis indicates that +percentage changes in the number of new COVID-19 cases are positively +associated with COVID-19 vaccine misinformation rates in a staggered manner for +Brazil and Indonesia, and there are significant positive associations between +the misinformation rates across the three countries. + +
+
+ comment: Accepted to EMNLP 2023 (Main conference), 9 pages, 5 figures +
+
+
+
+
+ + ☆ Positional Information Matters for Invariant In-Context Learning: A Case + Study of Simple Function Classes + + +
+ In-context learning (ICL) refers to the ability of a model to condition on a few in-context demonstrations (input-output examples of the underlying task) to generate the answer for a new query input, without updating parameters. Despite the impressive ICL ability of LLMs, it has also been found that ICL in LLMs is sensitive to input demonstrations and limited to short context lengths. To understand the limitations and principles of successful ICL, we conduct an investigation with ICL linear regression of transformers. We characterize several Out-of-Distribution (OOD) cases for ICL inspired by realistic LLM ICL failures and compare transformers with DeepSet, a simple yet powerful architecture for ICL. Surprisingly, DeepSet outperforms transformers across a variety of distribution shifts, implying that preserving permutation invariance symmetry with respect to input demonstrations is crucial for OOD ICL. This phenomenon specifies a fundamental requirement of ICL, which we term ICL invariance. Nevertheless, the positional encodings in LLMs will break ICL invariance. To this end, we further evaluate transformers with identical positional encodings and find that preserving ICL invariance in transformers achieves state-of-the-art performance across various ICL distribution shifts.
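+
+ A minimal sketch of a permutation-invariant DeepSet-style model of the kind compared against transformers above (layer sizes and dimensions are arbitrary illustrative choices, not the paper's configuration):
+
+ import torch
+ import torch.nn as nn
+
+ class DeepSet(nn.Module):
+     """phi is applied per demonstration, outputs are summed (permutation invariant), then rho decodes."""
+     def __init__(self, in_dim: int, hidden: int = 64, out_dim: int = 1):
+         super().__init__()
+         self.phi = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU(), nn.Linear(hidden, hidden))
+         self.rho = nn.Sequential(nn.ReLU(), nn.Linear(hidden, out_dim))
+
+     def forward(self, demos: torch.Tensor) -> torch.Tensor:   # demos: (batch, n_demos, in_dim)
+         return self.rho(self.phi(demos).sum(dim=1))
+
+ x = torch.randn(2, 8, 16)
+ model = DeepSet(in_dim=16)
+ # Reordering the demonstrations does not change the output.
+ assert torch.allclose(model(x), model(x[:, torch.randperm(8)]), atol=1e-5)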
+
+ comment: Ongoing work; preliminary version +
+
+
+
+
+
+
+
+
+ + ♻ ☆ Knowledge Pursuit Prompting for Zero-Shot Multimodal Synthesis + + +
+ Hallucinations and unfaithful synthesis due to inaccurate prompts with +insufficient semantic details are widely observed in multimodal generative +models. A prevalent strategy to align multiple modalities is to fine-tune the +generator with a large number of annotated text-image pairs. However, such a +procedure is labor-consuming and resource-draining. The key question we ask is: +can we enhance the quality and faithfulness of text-driven generative models +beyond extensive text-image pair annotations? To address this question, we +propose Knowledge Pursuit Prompting (KPP), a zero-shot framework that +iteratively incorporates external knowledge to help generators produce reliable +visual content. Instead of training generators to handle generic prompts, KPP +employs a recursive knowledge query process to gather informative external +facts from the knowledge base, instructs a language model to compress the +acquired knowledge for prompt refinement, and utilizes text-driven generators +for visual synthesis. The entire process is zero-shot, without accessing the +architectures and parameters of generative models. We evaluate the framework +across multiple text-driven generative tasks (image, 3D rendering, and video) +on datasets of different domains. We further demonstrate the extensibility and +adaptability of KPP through varying foundation model bases and instructions. +Our results show that KPP is capable of generating faithful and semantically +rich content across diverse visual domains, offering a promising solution to +improve multimodal generative models. + +
+
+
+
+
+ + ♻ ☆ A Baseline Analysis of Reward Models' Ability To Accurately Analyze + Foundation Models Under Distribution Shift + + +
+ Foundation models, specifically Large Language Models (LLMs), have lately gained widespread attention and adoption. Reinforcement Learning with Human Feedback (RLHF) involves training a reward model to capture desired behaviors, which is then used to align an LLM. These reward models are additionally used at inference time to estimate how well LLM responses adhere to those desired behaviors. However, there is little work measuring how robust these reward models are to distribution shifts. In this work, we evaluate how reward model performance - measured via accuracy and calibration (i.e. alignment between accuracy and confidence) - is affected by distribution shift. We show novel calibration patterns and accuracy drops due to OOD prompts and responses, and that the reward model is more sensitive to shifts in responses than in prompts. Additionally, we adapt an OOD detection technique commonly used in classification to the reward model setting in order to detect these distribution shifts in prompts and responses.
+
+
+
+
+ + ♻ ☆ Locally Differentially Private Document Generation Using Zero Shot + Prompting EMNLP 2023 + + +
+ Numerous studies have highlighted the privacy risks associated with +pretrained large language models. In contrast, our research offers a unique +perspective by demonstrating that pretrained large language models can +effectively contribute to privacy preservation. We propose a locally +differentially private mechanism called DP-Prompt, which leverages the power of +pretrained large language models and zero-shot prompting to counter author +de-anonymization attacks while minimizing the impact on downstream utility. +When DP-Prompt is used with a powerful language model like ChatGPT (gpt-3.5), +we observe a notable reduction in the success rate of de-anonymization attacks, +showing that it surpasses existing approaches by a considerable margin despite +its simpler design. For instance, in the case of the IMDB dataset, DP-Prompt +(with ChatGPT) perfectly recovers the clean sentiment F1 score while achieving +a 46\% reduction in author identification F1 score against static attackers and +a 26\% reduction against adaptive attackers. We conduct extensive experiments +across six open-source large language models, ranging up to 7 billion +parameters, to analyze various effects of the privacy-utility tradeoff. + +
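+
+ A minimal sketch of the zero-shot prompting pattern behind DP-Prompt, asking an LLM for a style-neutral paraphrase of a document; the prompt wording and the commented-out `llm.generate` call are placeholder assumptions, and the actual DP-Prompt mechanism additionally ties its differential-privacy guarantee to the sampling temperature of the generation:
+
+ def build_paraphrase_prompt(document: str) -> str:
+     """Zero-shot prompt requesting a paraphrase that hides author-identifying style."""
+     return (
+         "Paraphrase the following text so that its meaning and sentiment are preserved "
+         "but the writing style no longer identifies the original author:\n\n"
+         f"{document}\n\nParaphrase:"
+     )
+
+ # sanitized = llm.generate(build_paraphrase_prompt(review_text), temperature=1.5)   # hypothetical LLM call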
+
+ comment: Accepted at EMNLP 2023 (Findings) +
+
+
+
+
+ + ♻ ☆ Focused Transformer: Contrastive Training for Context Scaling NeurIPS 2023 + + +
+ Large language models have an exceptional capability to incorporate new information in a contextual manner. However, the full potential of such an approach is often restrained due to a limitation in the effective context length. One solution to this issue is to endow an attention layer with access to an external memory, which comprises (key, value) pairs. Yet, as the number of documents increases, the proportion of relevant keys to irrelevant ones decreases, leading the model to focus more on the irrelevant keys. We identify a significant challenge, dubbed the distraction issue, where keys linked to different semantic values might overlap, making them hard to distinguish. To tackle this problem, we introduce the Focused Transformer (FoT), a technique that employs a training process inspired by contrastive learning. This novel approach enhances the structure of the (key, value) space, enabling an extension of the context length. Our method allows for fine-tuning pre-existing, large-scale models to lengthen their effective context. This is demonstrated by our fine-tuning of $3B$ and $7B$ OpenLLaMA checkpoints. The resulting models, which we name LongLLaMA, exhibit advancements in tasks requiring a long context. We further illustrate that our LongLLaMA models adeptly manage a $256k$ context length for passkey retrieval.
+
+ comment: Accepted at 37th Conference on Neural Information Processing Systems + (NeurIPS 2023). 28 pages, 10 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ Supporting Human-AI Collaboration in Auditing LLMs with LLMs + + +
+ Large language models are becoming increasingly pervasive and ubiquitous in society via deployment in sociotechnical systems. Yet these language models, be it for classification or generation, have been shown to be biased and behave irresponsibly, causing harm to people at scale. It is crucial to audit these language models rigorously. Existing auditing tools leverage either or both humans and AI to find failures. In this work, we draw upon literature in human-AI collaboration and sensemaking, and conduct interviews with research experts in safe and fair AI, to build upon the auditing tool AdaTest (Ribeiro and Lundberg, 2022), which is powered by a generative large language model (LLM). Through the design process we highlight the importance of sensemaking and human-AI communication in leveraging the complementary strengths of humans and generative models in collaborative auditing. To evaluate the effectiveness of the augmented tool, AdaTest++, we conduct user studies with participants auditing two commercial language models: OpenAI's GPT-3 and Azure's sentiment analysis model. Qualitative analysis shows that AdaTest++ effectively leverages human strengths such as schematization, hypothesis formation and testing. Further, with our tool, participants identified a variety of failure modes, covering 26 different topics over 2 tasks, including failures that have been shown before in formal audits as well as ones that were previously under-reported.
+
+ comment: 21 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ ANPL: Towards Natural Programming with Interactive Decomposition + + +
+ Though LLMs are capable of generating plausible programs, it's challenging to +interact with the LLMs further to revise the program, especially if the user's +specific requirements are different from the initial proposal. In this paper, +we introduce ANPL, an interactive programming system that ensures users can +always refine the generated code towards their specific programmatic intents +via structured decompositions. Borrowing the paradigm of sketching from program +synthesis, an ANPL program consists of a set of input-outputs that it must +satisfy, a ``sketch'' -- control/data flow expressed in precise code (e.g. +Python), and ``holes'' -- sub-modules to be implemented by the LLM specified +with natural language. The user revises an ANPL program by either modifying the +sketch, changing the language used to describe the holes, or providing +additional input-outputs to a particular hole, turning it into a sub-ANPL +program that can be solved recursively. This workflow allows the users to +offload programming burdens to the LLM as much as possible while retaining the +ability to pinpoint and resolve bugs locally, without exposing the rest of the +program to the LLM. We deploy ANPL on the Abstraction and Reasoning Corpus +(ARC), a set of unique tasks that are challenging for state-of-the-art AI +systems, showing it outperforms baseline programming systems that (a) without +the ability to decompose tasks interactively and (b) without the guarantee that +the modules can be correctly composed together. Additional evaluations on APPS, +HumanEval, and real-world programming tasks have validated that the ANPL +framework is applicable to multiple programming domains. We release the ANPL +solutions to the ARC tasks as a dataset, providing insights into how humans +decompose novel tasks programmatically. See our code at +https://iprc-dip.github.io/ANPL/. + +
+
+
+
+
+ + ♻ ☆ Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for + Large Language Models + + +
+ Large Language Models (LLMs), with their remarkable task-handling +capabilities and innovative outputs, have catalyzed significant advancements +across a spectrum of fields. However, their proficiency within specialized +domains such as biomolecular studies remains limited. To address this +challenge, we introduce Mol-Instructions, a comprehensive instruction dataset +designed for the biomolecular domain. Mol-Instructions encompasses three key +components: molecule-oriented instructions, protein-oriented instructions, and +biomolecular text instructions. Each component aims to improve the +understanding and prediction capabilities of LLMs concerning biomolecular +features and behaviors. Through extensive instruction tuning experiments on +LLMs, we demonstrate the effectiveness of Mol-Instructions in enhancing large +models' performance in the intricate realm of biomolecular studies, thus +fostering progress in the biomolecular research community. Mol-Instructions is +publicly available for ongoing research and will undergo regular updates to +enhance its applicability. + +
+
+ comment: Project homepage: https://github.com/zjunlp/Mol-Instructions, add + more experiments +
+
+
+
+
+ + ♻ ☆ Controlling Pre-trained Language Models for Grade-Specific Text + Simplification EMNLP 2023 + + +
+ Text simplification (TS) systems rewrite text to make it more readable while +preserving its content. However, what makes a text easy to read depends on the +intended readers. Recent work has shown that pre-trained language models can +simplify text using a wealth of techniques to control output simplicity, +ranging from specifying only the desired reading grade level, to directly +specifying low-level edit operations. Yet it remains unclear how to set these +control parameters in practice. Existing approaches set them at the corpus +level, disregarding the complexity of individual inputs and considering only +one level of output complexity. In this work, we conduct an empirical study to +understand how different control mechanisms impact the adequacy and simplicity +of text simplification systems. Based on these insights, we introduce a simple +method that predicts the edit operations required for simplifying a text for a +specific grade level on an instance-per-instance basis. This approach improves +the quality of the simplified outputs over corpus-level search-based +heuristics. + +
+
+ comment: EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ Generating More Pertinent Captions by Leveraging Semantics and Style on + Multi-Source Datasets + + +
+ This paper addresses the task of generating fluent descriptions by training +on a non-uniform combination of data sources, containing both human-annotated +and web-collected captions. Large-scale datasets with noisy image-text pairs, +indeed, provide a sub-optimal source of supervision because of their +low-quality descriptive style, while human-annotated datasets are cleaner but +smaller in scale. To get the best of both worlds, we propose to leverage and +separate semantics and descriptive style through the incorporation of a style +token and keywords extracted through a retrieval component. The proposed model +avoids the need of object detectors, is trained with a single objective of +prompt language modeling, and can replicate the style of human-collected +captions while training on sources with different input styles. Experimentally, +the model shows a strong capability of recognizing real-world concepts and +producing high-quality captions. Extensive experiments are performed on +different image captioning datasets, including CC3M, nocaps, and the +competitive COCO dataset, where our model consistently outperforms baselines +and state-of-the-art approaches. + +
+
+ comment: Accepted to IJCV +
+
+
+
+
+ + ♻ ☆ Analyzing Semantic Faithfulness of Language Models via Input + Intervention on Question Answering + + +
+ Transformer-based language models have been shown to be highly effective for +several NLP tasks. In this paper, we consider three transformer models, BERT, +RoBERTa, and XLNet, in both small and large versions, and investigate how +faithful their representations are with respect to the semantic content of +texts. We formalize a notion of semantic faithfulness, in which the semantic +content of a text should causally figure in a model's inferences in question +answering. We then test this notion by observing a model's behavior on +answering questions about a story after performing two novel semantic +interventions: deletion intervention and negation intervention. While +transformer models achieve high performance on standard question answering +tasks, we show that they fail to be semantically faithful once we perform these +interventions for a significant number of cases (~50% for deletion +intervention, and ~20% drop in accuracy for negation intervention). We then +propose an intervention-based training regime that can mitigate the undesirable +effects for deletion intervention by a significant margin (from ~ 50% to ~6%). +We analyze the inner-workings of the models to better understand the +effectiveness of intervention-based training for deletion intervention. But we +show that this training does not attenuate other aspects of semantic +unfaithfulness such as the models' inability to deal with negation intervention +or to capture the predicate-argument structure of texts. We also test +InstructGPT, via prompting, for its ability to handle the two interventions and +to capture predicate-argument structure. While InstructGPT models do achieve +very high performance on predicate-argument structure task, they fail to +respond adequately to our deletion and negation interventions. + +
+
+
+
+
+ + ♻ ☆ Language Models as Black-Box Optimizers for Vision-Language Models + + +
+ Vision-language models (VLMs) pre-trained on web-scale datasets have +demonstrated remarkable capabilities on downstream tasks when fine-tuned with +minimal data. However, many VLMs rely on proprietary data and are not +open-source, which restricts the use of white-box approaches for fine-tuning. +As such, we aim to develop a black-box approach to optimize VLMs through +natural language prompts, thereby avoiding the need to access model parameters, +feature embeddings, or even output logits. We propose employing chat-based LLMs +to search for the best text prompt for VLMs. Specifically, we adopt an +automatic hill-climbing procedure that converges to an effective prompt by +evaluating the performance of current prompts and asking LLMs to refine them +based on textual feedback, all within a conversational process without +human-in-the-loop. In a challenging 1-shot image classification setup, our +simple approach surpasses the white-box continuous prompting method (CoOp) by +an average of 1.5% across 11 datasets including ImageNet. Our approach also +outperforms both human-engineered and LLM-generated prompts. We highlight the +advantage of conversational feedback that incorporates both positive and +negative prompts, suggesting that LLMs can utilize the implicit gradient +direction in textual feedback for a more efficient search. In addition, we find +that the text prompts generated through our strategy are not only more +interpretable but also transfer well across different VLM architectures in a +black-box manner. Lastly, we demonstrate our framework on a state-of-the-art +black-box VLM (DALL-E 3) for text-to-image optimization. + +
+
+ comment: Project site: llm-can-optimize-vlm.github.io +
+
+
+
+
+ + ♻ ☆ Temporal Information Extraction by Predicting Relative Time-lines EMNLP 2018 + + +
+ The current leading paradigm for temporal information extraction from text +consists of three phases: (1) recognition of events and temporal expressions, +(2) recognition of temporal relations among them, and (3) time-line +construction from the temporal relations. In contrast to the first two phases, +the last phase, time-line construction, received little attention and is the +focus of this work. In this paper, we propose a new method to construct a +linear time-line from a set of (extracted) temporal relations. But more +importantly, we propose a novel paradigm in which we directly predict start and +end-points for events from the text, constituting a time-line without going +through the intermediate step of prediction of temporal relations as in earlier +work. Within this paradigm, we propose two models that predict in linear +complexity, and a new training loss using TimeML-style annotations, yielding +promising results. + +
+
+ comment: Accepted at the Conference on Empirical Methods in Natural Language + Processing (EMNLP 2018). Small correction in Eq. 6 on 30 Nov. 2023 +
+
+
+
+
+ + ♻ ☆ On Context Utilization in Summarization with Large Language Models + + +
+ Large language models (LLMs) excel in zero-shot abstractive summarization +tasks, delivering fluent and pertinent summaries. Recent advancements have +extended their capabilities to handle long-input contexts, surpassing token +limits of 100k. However, in the realm of multi-document question answering, +language models exhibit uneven utilization of their input context. They tend to +favor the initial and final segments, resulting in a U-shaped performance +pattern concerning where the answer is located within the input. This bias +raises concerns, particularly in summarization tasks where crucial content may +be dispersed throughout the source document(s). This paper presents a +comprehensive investigation encompassing 10 datasets, 5 LLMs, and 5 evaluation +metrics to analyze how these models leverage their input for abstractive +summarization. Our findings reveal a pronounced bias towards the introductory +content (and to a lesser extent, the final content), posing challenges for LLM +performance across a range of diverse summarization benchmarks. + +
+
+ comment: work in progress +
+
+
+
+
+ + ♻ ☆ Predicting Emergent Abilities with Infinite Resolution Evaluation + + +
+ The scientific scale-up of large language models (LLMs) necessitates a comprehensive understanding of their scaling properties. However, the existing literature on scaling properties only yields an incomplete answer: optimization loss decreases predictably as the model size increases, in line with established scaling laws; yet no scaling law for task performance has been established, and task performances are far from predictable during scaling. Task performances typically show minor gains on small models until they improve dramatically once models exceed a size threshold, exemplifying the ``emergent abilities''. In this study, we discover that small models, although they exhibit only minor performance, demonstrate critical and consistent task performance improvements that are not captured by conventional evaluation strategies due to insufficient measurement resolution. To measure such improvements, we introduce PassUntil, an evaluation strategy with theoretically infinite resolution, through massive sampling in the decoding phase. With PassUntil, we conduct a quantitative investigation into the scaling law of task performance. The investigation contains two parts. Firstly, a strict task scaling law that is not conventionally known to exist is identified, enhancing the predictability of task performances. Remarkably, we are able to predict the performance of the 2.4B model on code generation with merely 0.05\% deviation before training starts, which is the first systematic attempt to verify the predictable scaling proposed in GPT-4's report. Secondly, we are able to study emergent abilities quantitatively. We identify a kind of accelerated emergence whose scaling curve cannot be fitted by the standard scaling law function and which has an increasing speed. We then examine two hypotheses and suggest that the ``multiple circuits hypothesis'' might be responsible for the accelerated emergence.
+
+ comment: After revision +
+
+
+
+
+ + ♻ ☆ Editing Large Language Models: Problems, Methods, and Opportunities EMNLP 2023 + + +
+ Despite the ability to train capable LLMs, the methodology for maintaining +their relevancy and rectifying errors remains elusive. To this end, the past +few years have witnessed a surge in techniques for editing LLMs, the objective +of which is to efficiently alter the behavior of LLMs within a specific domain +without negatively impacting performance across other inputs. This paper +embarks on a deep exploration of the problems, methods, and opportunities +related to model editing for LLMs. In particular, we provide an exhaustive +overview of the task definition and challenges associated with model editing, +along with an in-depth empirical analysis of the most progressive methods +currently at our disposal. We also build a new benchmark dataset to facilitate +a more robust evaluation and pinpoint enduring issues intrinsic to existing +techniques. Our objective is to provide valuable insights into the +effectiveness and feasibility of each editing technique, thereby assisting the +community in making informed decisions on the selection of the most appropriate +method for a specific task or context. Code and datasets are available at +https://github.com/zjunlp/EasyEdit. + +
+
+ comment: EMNLP 2023. Updated with new experiments +
+
+
+
+
+ + ♻ ☆ Unified Segment-to-Segment Framework for Simultaneous Sequence + Generation NeurIPS 2023 + + +
+ Simultaneous sequence generation is a pivotal task for real-time scenarios,
+such as streaming speech recognition, simultaneous machine translation and
+simultaneous speech translation, where the target sequence is generated while
+receiving the source sequence. The crux of achieving high-quality generation
+with low latency lies in identifying the optimal moments for generating,
+accomplished by learning a mapping between the source and target sequences.
+However, existing methods often rely on task-specific heuristics for different
+sequence types, limiting the model's capacity to adaptively learn the
+source-target mapping and hindering the exploration of multi-task learning for
+various simultaneous tasks. In this paper, we propose a unified
+segment-to-segment framework (Seg2Seg) for simultaneous sequence generation,
+which learns the mapping in an adaptive and unified manner. During the process
+of simultaneous generation, the model alternates between waiting for a source
+segment and generating a target segment, making the segment serve as the
+natural bridge between the source and target. To accomplish this, Seg2Seg
+introduces a latent segment as the pivot between source and target and explores
+all potential source-target mappings via the proposed expectation training,
+thereby learning the optimal moments for generating. Experiments on multiple
+simultaneous generation tasks demonstrate that Seg2Seg achieves
+state-of-the-art performance and exhibits better generality across various
+tasks. + +
+
+ comment: Accepted at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Language Models as a Service: Overview of a New Paradigm and its + Challenges + + +
+ Some of the most powerful language models currently are proprietary systems, +accessible only via (typically restrictive) web or software programming +interfaces. This is the Language-Models-as-a-Service (LMaaS) paradigm. In +contrast with scenarios where full model access is available, as in the case of +open-source models, such closed-off language models present specific challenges +for evaluating, benchmarking, and testing them. This paper has two goals: on +the one hand, we delineate how the aforementioned challenges act as impediments +to the accessibility, replicability, reliability, and trustworthiness of LMaaS. +We systematically examine the issues that arise from a lack of information +about language models for each of these four aspects. We conduct a detailed +analysis of existing solutions and put forth a number of considered +recommendations, and highlight the directions for future advancements. On the +other hand, it serves as a comprehensive resource for existing knowledge on +current, major LMaaS, offering a synthesized overview of the licences and +capabilities their interfaces offer. + +
+
+
+
+
+ + ♻ ☆ Handwriting recognition and automatic scoring for descriptive answers in + Japanese language tests + + +
+ This paper presents an experiment on automatically scoring handwritten
+descriptive answers in the trial tests for the new Japanese university entrance
+examination, which were made for about 120,000 examinees in 2017 and 2018.
+There are about 400,000 answers with more than 20 million characters. Although
+all answers have been scored by human examiners, handwritten characters are not
+labeled. We present our attempt to adapt deep neural network-based handwriting
+recognizers trained on a labeled handwriting dataset to this unlabeled answer
+set. Our proposed method combines different training strategies, ensembles
+multiple recognizers, and uses a language model built from a large general
+corpus to avoid overfitting to specific data. In our experiment, the proposed
+method achieves a character accuracy of over 97% using about 2,000 verified
+labeled answers that account for less than 0.5% of the dataset. Then, the
+recognized answers are fed into a pre-trained automatic scoring system based on
+the BERT model without correcting misrecognized characters or providing rubric
+annotations. The automatic scoring system achieves Quadratic Weighted Kappa
+(QWK) scores from 0.84 to 0.98. Since QWK exceeds 0.8, this indicates an
+acceptable agreement between the automatic scoring system and the human
+examiners. These results are promising for further research on end-to-end
+automatic scoring of descriptive answers. + +
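+ For reference, Quadratic Weighted Kappa can be computed as in the standard definition
+below; this is a generic sketch (integer scores and the label range are assumptions),
+not code from the paper:
+
+ import numpy as np
+
+ def quadratic_weighted_kappa(rater_a, rater_b, num_labels):
+     """QWK between two integer score vectors with values in [0, num_labels)."""
+     O = np.zeros((num_labels, num_labels))
+     for a, b in zip(rater_a, rater_b):
+         O[a, b] += 1                                        # observed agreement matrix
+     W = np.array([[(i - j) ** 2 for j in range(num_labels)]
+                   for i in range(num_labels)], dtype=float) / (num_labels - 1) ** 2
+     E = np.outer(O.sum(axis=1), O.sum(axis=0)) / O.sum()    # agreement expected by chance
+     return 1.0 - (W * O).sum() / (W * E).sum()
+
+ A QWK of 1 means perfect agreement, 0 means chance-level agreement, which is why the
+reported 0.84-0.98 range is read as acceptable consistency with human examiners.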
+
+ comment: Keywords: handwritten Japanese answers, handwriting recognition, + automatic scoring, ensemble recognition, deep neural networks; Reported in + IEICE technical report, PRMU2021-32, pp.45-50 (2021.12) Published after peer + review and Presented in ICFHR2022, Lecture Notes in Computer Science, vol. + 13639, pp. 274-284 (2022.11) +
+
+
+
+
+ + ♻ ☆ Multi-turn Response Selection using Dialogue Dependency Relations EMNLP2020 + + +
+ Multi-turn response selection is a task designed for developing dialogue
+agents. Performance on this task has improved remarkably with
+pre-trained language models. However, these models simply concatenate the turns
+in dialogue history as the input and largely ignore the dependencies between
+the turns. In this paper, we propose a dialogue extraction algorithm to
+transform a dialogue history into threads based on their dependency relations.
+Each thread can be regarded as a self-contained sub-dialogue. We also propose a
+Thread-Encoder model to encode threads and candidates into compact
+representations with pre-trained Transformers and finally obtain the matching score
+through an attention layer. The experiments show that dependency relations are
+helpful for dialogue context understanding, and our model outperforms the
+state-of-the-art baselines on both DSTC7 and DSTC8*, with competitive results
+on UbuntuV2. + +
+
+ comment: Accepted for publication as a long paper in EMNLP2020 +
+
+
+
+
+ + ♻ ☆ How to Build an AI Tutor that Can Adapt to Any Course and Provide + Accurate Answers Using Large Language Model and Retrieval-Augmented + Generation + + +
+ Artificial intelligence is transforming education through data-driven,
+personalized learning solutions. This paper introduces AI Tutor, an innovative
+web application that provides personalized tutoring in any subject using a
+state-of-the-art Large Language Model (LLM). AI Tutor ingests course materials
+to construct an adaptive knowledge base tailored to the course. When students
+pose questions, it retrieves the most relevant information and generates
+detailed, conversational responses citing supporting evidence. The system is
+powered by advanced large language models and Retrieval-Augmented Generation
+(RAG) techniques for accurate, natural question answering. We present a
+fully-functional web interface and video demonstration that showcase AI Tutor's
+versatility across diverse subjects and its ability to produce pedagogically
+cogent responses. While an initial prototype, this work represents a pioneering
+step toward AI-enabled tutoring systems that can democratize access to
+high-quality, customized educational support. + +
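+ A minimal retrieval-augmented answering loop of the kind described here might look
+like the sketch below; the retriever and LLM interfaces are hypothetical placeholders,
+not the AI Tutor codebase:
+
+ def answer_question(question, retriever, llm, k=5):
+     """Retrieve the k most relevant course chunks and ground the answer on them."""
+     chunks = retriever.search(question, top_k=k)        # hypothetical retriever API
+     context = "\n\n".join(chunk.text for chunk in chunks)
+     prompt = ("Answer the student's question using only the course material below, "
+               "and cite the passages you use.\n\n"
+               f"Material:\n{context}\n\nQuestion: {question}")
+     return llm.generate(prompt)                         # hypothetical LLM API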
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ CLOMO: Counterfactual Logical Modification with Large Language Models + + +
+ In this study, we delve into the realm of counterfactual reasoning +capabilities of large language models (LLMs). Our primary objective is to +cultivate the counterfactual thought processes within LLMs and rigorously +assess these processes for their validity. Specifically, we introduce a novel +task, Counterfactual Logical Modification (CLOMO), and a high-quality +human-annotated benchmark. In this task, LLMs must adeptly alter a given +argumentative text to uphold a predetermined logical relationship. To +effectively evaluate a generation model's counterfactual capabilities, we +propose an innovative evaluation metric, the LogicAware Counterfactual Score to +directly evaluate the natural language output of LLMs instead of modeling the +task as a multiple-choice problem. Analysis shows that the proposed automatic +metric aligns well with human preference. Our experimental results show that +while LLMs demonstrate a notable capacity for logical counterfactual thinking, +there remains a discernible gap between their current abilities and human +performance. + +
+
+
+
+
+ + ♻ ☆ SELF: Language-Driven Self-Evolution for Large Language Models + + +
+ Large Language Models (LLMs) have demonstrated remarkable versatility across +various domains. To further advance LLMs, we propose 'SELF' (Self-Evolution +with Language Feedback), a novel approach that enables LLMs to self-improve +through self-reflection, akin to human learning processes. SELF initiates with +a meta-skill learning process that equips the LLMs with capabilities for +self-feedback and self-refinement. Subsequently, the model undergoes an +iterative process of self-evolution. In each iteration, it utilizes an +unlabeled dataset of instructions to generate initial responses. These +responses are enhanced through self-feedback and self-refinement. The model is +then fine-tuned using this enhanced data. The model undergoes progressive +improvement through this iterative self-evolution process. Moreover, the SELF +framework enables the model to apply self-refinement during inference, which +further improves response quality. Our experiments in mathematics and general +tasks demonstrate that SELF can enhance the capabilities of LLMs without human +intervention. The SELF framework indicates a promising direction for the +autonomous evolution of LLMs, transitioning them from passive information +receivers to active participants in their development. + +
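+ One way to picture the iterative self-evolution loop described above is the sketch
+below, where `model.generate` and `finetune` are stand-ins rather than the released
+SELF code:
+
+ def self_evolve(model, unlabeled_instructions, rounds=3):
+     """Generate, self-critique, self-refine, then fine-tune on the refined data."""
+     for _ in range(rounds):
+         improved = []
+         for instruction in unlabeled_instructions:
+             draft = model.generate(instruction)
+             feedback = model.generate(f"Critique this response:\n{draft}")
+             refined = model.generate(
+                 f"Rewrite the response using the critique:\n{draft}\n{feedback}")
+             improved.append((instruction, refined))
+         model = finetune(model, improved)   # hypothetical fine-tuning helper
+     return model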
+
+ comment: 17 pages, 4 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ Improving the Robustness of Transformer-based Large Language Models with + Dynamic Attention + + +
+ Transformer-based models, such as BERT and GPT, have been widely adopted in
+natural language processing (NLP) due to their exceptional performance.
+However, recent studies show their vulnerability to textual adversarial attacks
+where the model's output can be misled by intentionally manipulating the text
+inputs. Although various methods have been proposed to enhance the model's
+robustness and mitigate this vulnerability, many require heavy resource
+consumption (e.g., adversarial training) or only provide limited protection
+(e.g., defensive dropout). In this paper, we propose a novel method called
+dynamic attention, tailored for the transformer architecture, to enhance the
+inherent robustness of the model itself against various adversarial attacks.
+Our method requires no downstream task knowledge and does not incur additional
+costs. The proposed dynamic attention consists of two modules: (i) attention
+rectification, which masks or weakens the attention value of the chosen tokens,
+and (ii) dynamic modeling, which dynamically builds the set of candidate
+tokens. Extensive experiments demonstrate that dynamic attention significantly
+mitigates the impact of adversarial attacks, improving performance by up to 33\%
+over previous methods against widely-used adversarial attacks. The
+model-level design of dynamic attention enables it to be easily combined with
+other defense methods (e.g., adversarial training) to further enhance the
+model's robustness. Furthermore, we demonstrate that dynamic attention
+preserves the state-of-the-art robustness space of the original model compared
+to other dynamic modeling methods. + +
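+ The attention-rectification idea, masking or weakening the attention paid to chosen
+tokens, can be illustrated with the minimal sketch below; how the tokens are chosen
+and the scaling factor are assumptions for illustration only:
+
+ import torch
+
+ def rectify_attention(attn_weights, suspicious_idx, scale=0.1):
+     """Down-weight attention toward chosen key positions, then renormalize.
+
+     attn_weights: (batch, heads, query_len, key_len) post-softmax weights.
+     suspicious_idx: indices of key positions to weaken (scale=0 masks them out).
+     """
+     attn = attn_weights.clone()
+     attn[..., suspicious_idx] = attn[..., suspicious_idx] * scale
+     return attn / attn.sum(dim=-1, keepdim=True)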
+
+
+
+
+ + ♻ ☆ Do pretrained Transformers Really Learn In-context by Gradient Descent? + + +
+ The emergence of In-Context Learning (ICL) in LLMs remains a significant yet
+poorly understood phenomenon. Recent studies try to explain ICL by connecting
+it to Gradient Descent (GD). However, do these explanations hold up in practice
+for actual pre-trained models?
+ We highlight the limiting assumptions in prior works that make their context
+considerably different from the practical context in which language models are
+trained. For example, the theoretical hand-constructed weights used in these
+studies have properties that don't match those of real LLMs. Furthermore, their
+experimental verification uses the \emph{ICL objective} (training models explicitly
+for ICL), which differs from the emergent ICL in the wild.
+ We also look for evidence in real models. We observe that ICL and GD have
+different sensitivities to the order in which they observe demonstrations.
+Finally, we probe and compare the ICL vs. GD hypothesis in a natural setting.
+We conduct comprehensive empirical analyses on language models pre-trained on
+natural data (LLaMa-7B). Our comparisons of three performance metrics highlight
+the inconsistent behavior of ICL and GD as a function of various factors such
+as datasets, models, and the number of demonstrations. We observe that ICL and
+GD modify the output distribution of language models differently. These results
+indicate that the equivalence between ICL and GD remains an open hypothesis and
+call for further study. + +
+
+
+
+
+ + ♻ ☆ Probing Quantifier Comprehension in Large Language Models: Another + Example of Inverse Scaling EMNLP 2023 + + +
+ With their increasing size, large language models (LLMs) are becoming
+increasingly good at language understanding tasks. But even with high
+performance on specific downstream tasks, LLMs fail at simple linguistic tests
+for negation or quantifier understanding. Previous work on quantifier
+understanding in LLMs shows inverse scaling in understanding few-type
+quantifiers. In this paper, we question the claims of previous work and show
+that this result stems from inappropriate testing methodology. We also present
+alternate methods to measure quantifier comprehension in LLMs and show that
+LLMs are able to better understand the difference between the meaning of
+few-type and most-type quantifiers as their size increases, although they are
+not particularly good at it. We also observe inverse scaling for most-type
+quantifier understanding, which is contrary to human psycho-linguistic
+experiments and previous work, where the model's understanding of most-type
+quantifiers gets worse as the model size increases. We do this evaluation on
+models ranging from 125M to 175B parameters, which suggests that LLMs do not do
+as well as expected with quantifiers. We also discuss the possible reasons for
+this and the relevance of quantifier understanding in evaluating language
+understanding in LLMs. + +
+
+ comment: Accepted to BlackboxNLP (EMNLP 2023) +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 150 + +
+
+
+ + ☆ Dataset Distillation in Large Data Era + + +
+ Dataset distillation aims to generate a smaller but representative subset
+from a large dataset, which allows a model to be trained efficiently while
+still achieving decent performance when evaluated on the original test
+distribution. Many prior works have aimed to align with diverse aspects of the
+original datasets, such as matching the training weight trajectories, gradients,
+feature/BatchNorm distributions, etc. In this work, we show how to distill
+various large-scale datasets such as full ImageNet-1K/21K under a conventional
+input resolution of 224$\times$224 to achieve the best accuracy over all
+previous approaches, including SRe$^2$L, TESLA and MTT. To achieve this, we
+introduce a simple yet effective ${\bf C}$urriculum ${\bf D}$ata ${\bf
+A}$ugmentation ($\texttt{CDA}$) during data synthesis that achieves 63.2%
+accuracy on large-scale ImageNet-1K under IPC (Images Per Class) 50 and 36.1%
+on ImageNet-21K under IPC 20. Finally, we show that, by integrating all
+our enhancements together, the proposed model beats the current
+state-of-the-art by more than 4% Top-1 accuracy on ImageNet-1K/21K and for the
+first time, reduces the gap to its full-data training counterpart to less than
+absolute 15%. Moreover, this work represents the inaugural success in dataset
+distillation on larger-scale ImageNet-21K under the standard 224$\times$224
+resolution. Our code and distilled ImageNet-21K dataset of 20 IPC, 2K recovery
+budget are available at https://github.com/VILA-Lab/SRe2L/tree/main/CDA. + +
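+ One simple way to picture a curriculum over augmentation strength during data
+synthesis is a schedule that moves the crop-scale bounds from easy to hard over the
+synthesis steps; the linear ramp and the particular bounds below are illustrative
+assumptions, not the released CDA recipe:
+
+ def curriculum_crop_scale(step, total_steps, easy_low=0.5, hard_low=0.08):
+     """Lower bound of RandomResizedCrop's scale range, ramped from easy to hard."""
+     t = min(step / max(total_steps, 1), 1.0)
+     low = easy_low + t * (hard_low - easy_low)
+     return (low, 1.0)   # e.g. pass to torchvision.transforms.RandomResizedCrop(scale=...)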
+
+ comment: Code and distilled ImageNet-21K dataset are available at + https://github.com/VILA-Lab/SRe2L/tree/main/CDA +
+
+
+
+
+ + ☆ TrafficMOT: A Challenging Dataset for Multi-Object Tracking in Complex + Traffic Scenarios + + +
+ Multi-object tracking in traffic videos is a crucial research area, offering +immense potential for enhancing traffic monitoring accuracy and promoting road +safety measures through the utilisation of advanced machine learning +algorithms. However, existing datasets for multi-object tracking in traffic +videos often feature limited instances or focus on single classes, which cannot +well simulate the challenges encountered in complex traffic scenarios. To +address this gap, we introduce TrafficMOT, an extensive dataset designed to +encompass diverse traffic situations with complex scenarios. To validate the +complexity and challenges presented by TrafficMOT, we conducted comprehensive +empirical studies using three different settings: fully-supervised, +semi-supervised, and a recent powerful zero-shot foundation model Tracking +Anything Model (TAM). The experimental results highlight the inherent +complexity of this dataset, emphasising its value in driving advancements in +the field of traffic monitoring and multi-object tracking. + +
+
+ comment: 17 pages, 7 figures +
+
+
+
+
+ + ☆ Just Add $π$! Pose Induced Video Transformers for Understanding + Activities of Daily Living + + +
+ Video transformers have become the de facto standard for human action +recognition, yet their exclusive reliance on the RGB modality still limits +their adoption in certain domains. One such domain is Activities of Daily +Living (ADL), where RGB alone is not sufficient to distinguish between visually +similar actions, or actions observed from multiple viewpoints. To facilitate +the adoption of video transformers for ADL, we hypothesize that the +augmentation of RGB with human pose information, known for its sensitivity to +fine-grained motion and multiple viewpoints, is essential. Consequently, we +introduce the first Pose Induced Video Transformer: PI-ViT (or $\pi$-ViT), a +novel approach that augments the RGB representations learned by video +transformers with 2D and 3D pose information. The key elements of $\pi$-ViT are +two plug-in modules, 2D Skeleton Induction Module and 3D Skeleton Induction +Module, that are responsible for inducing 2D and 3D pose information into the +RGB representations. These modules operate by performing pose-aware auxiliary +tasks, a design choice that allows $\pi$-ViT to discard the modules during +inference. Notably, $\pi$-ViT achieves the state-of-the-art performance on +three prominent ADL datasets, encompassing both real-world and large-scale +RGB-D datasets, without requiring poses or additional computational overhead at +inference. + +
+
+ comment: Code and models will be released at: + https://github.com/dominickrei/pi-vit +
+
+
+
+
+ + ☆ PoseGPT: Chatting about 3D Human Pose + + +
+ We introduce PoseGPT, a framework employing Large Language Models (LLMs) to
+understand and reason about 3D human poses from images or textual descriptions.
+Our work is motivated by the human ability to intuitively understand postures
+from a single image or a brief description, a process that intertwines image
+interpretation, world knowledge, and an understanding of body language.
+Traditional human pose estimation methods, whether image-based or text-based,
+often lack holistic scene comprehension and nuanced reasoning, leading to a
+disconnect between visual data and its real-world implications. PoseGPT
+addresses these limitations by embedding SMPL poses as a distinct signal token
+within a multi-modal LLM, enabling direct generation of 3D body poses from both
+textual and visual inputs. This approach not only simplifies pose prediction
+but also empowers LLMs to apply their world knowledge in reasoning about human
+poses, fostering two advanced tasks: speculative pose generation and reasoning
+about pose estimation. These tasks involve reasoning about humans to generate
+3D poses from subtle text queries, possibly accompanied by images. We establish
+benchmarks for these tasks, moving beyond traditional 3D pose generation and
+estimation methods. Our results show that PoseGPT outperforms existing
+multimodal LLMs and task-specific methods on these newly proposed tasks.
+Furthermore, PoseGPT's ability to understand and generate 3D human poses based
+on complex reasoning opens new directions in human pose analysis. + +
+
+ comment: Home page: https://yfeng95.github.io/posegpt +
+
+
+
+
+ + ☆ VIDiff: Translating Videos via Multi-Modal Instructions with Diffusion + Models + + +
+ Diffusion models have achieved significant success in image and video +generation. This motivates a growing interest in video editing tasks, where +videos are edited according to provided text descriptions. However, most +existing approaches only focus on video editing for short clips and rely on +time-consuming tuning or inference. We are the first to propose Video +Instruction Diffusion (VIDiff), a unified foundation model designed for a wide +range of video tasks. These tasks encompass both understanding tasks (such as +language-guided video object segmentation) and generative tasks (video editing +and enhancement). Our model can edit and translate the desired results within +seconds based on user instructions. Moreover, we design an iterative +auto-regressive method to ensure consistency in editing and enhancing long +videos. We provide convincing generative results for diverse input videos and +written instructions, both qualitatively and quantitatively. More examples can +be found at our website https://ChenHsing.github.io/VIDiff. + +
+
+
+
+
+ + ☆ InstructSeq: Unifying Vision Tasks with Instruction-conditioned + Multi-modal Sequence Generation + + +
+ Empowering models to dynamically accomplish tasks specified through natural +language instructions represents a promising path toward more capable and +general artificial intelligence. In this work, we introduce InstructSeq, an +instruction-conditioned multi-modal modeling framework that unifies diverse +vision tasks through flexible natural language control and handling of both +visual and textual data. InstructSeq employs a multimodal transformer +architecture encompassing visual, language, and sequential modeling. We utilize +a visual encoder to extract image features and a text encoder to encode +instructions. An autoregressive transformer fuses the representations and +generates sequential task outputs. By training with LLM-generated natural +language instructions, InstructSeq acquires a strong comprehension of free-form +instructions for specifying visual tasks. This provides an intuitive interface +for directing capabilities using flexible natural instructions. Without any +task-specific tuning, InstructSeq achieves compelling performance on semantic +segmentation, referring expression segmentation/comprehension, and image +captioning. The flexible control and multi-task unification empower the model +with more human-like versatility and generalizability for computer vision. The +code will be released soon at https://github.com/rongyaofang/InstructSeq. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ ART$\boldsymbol{\cdot}$V: Auto-Regressive Text-to-Video Generation with + Diffusion Models + + +
+ We present ART$\boldsymbol{\cdot}$V, an efficient framework for +auto-regressive video generation with diffusion models. Unlike existing methods +that generate entire videos in one-shot, ART$\boldsymbol{\cdot}$V generates a +single frame at a time, conditioned on the previous ones. The framework offers +three distinct advantages. First, it only learns simple continual motions +between adjacent frames, therefore avoiding modeling complex long-range motions +that require huge training data. Second, it preserves the high-fidelity +generation ability of the pre-trained image diffusion models by making only +minimal network modifications. Third, it can generate arbitrarily long videos +conditioned on a variety of prompts such as text, image or their combinations, +making it highly versatile and flexible. To combat the common drifting issue in +AR models, we propose masked diffusion model which implicitly learns which +information can be drawn from reference images rather than network predictions, +in order to reduce the risk of generating inconsistent appearances that cause +drifting. Moreover, we further enhance generation coherence by conditioning it +on the initial frame, which typically contains minimal noise. This is +particularly useful for long video generation. When trained for only two weeks +on four GPUs, ART$\boldsymbol{\cdot}$V already can generate videos with natural +motions, rich details and a high level of aesthetic quality. Besides, it +enables various appealing applications, e.g., composing a long video from +multiple text prompts. + +
+
+ comment: 24 pages, 21 figures. Project page at + https://warranweng.github.io/art.v +
+
+
+
+
+ + ☆ Exploiting Diffusion Prior for Generalizable Pixel-Level Semantic + Prediction + + +
+ Contents generated by recent advanced Text-to-Image (T2I) diffusion models +are sometimes too imaginative for existing off-the-shelf property semantic +predictors to estimate due to the immitigable domain gap. We introduce DMP, a +pipeline utilizing pre-trained T2I models as a prior for pixel-level semantic +prediction tasks. To address the misalignment between deterministic prediction +tasks and stochastic T2I models, we reformulate the diffusion process through a +sequence of interpolations, establishing a deterministic mapping between input +RGB images and output prediction distributions. To preserve generalizability, +we use low-rank adaptation to fine-tune pre-trained models. Extensive +experiments across five tasks, including 3D property estimation, semantic +segmentation, and intrinsic image decomposition, showcase the efficacy of the +proposed method. Despite limited-domain training data, the approach yields +faithful estimations for arbitrary images, surpassing existing state-of-the-art +algorithms. + +
+
+ comment: Project page: https://shinying.github.io/dmp +
+
+
+
+
+ + ☆ MotionEditor: Editing Video Motion via Content-Aware Diffusion + + +
+ Existing diffusion-based video editing models have made gorgeous advances for +editing attributes of a source video over time but struggle to manipulate the +motion information while preserving the original protagonist's appearance and +background. To address this, we propose MotionEditor, a diffusion model for +video motion editing. MotionEditor incorporates a novel content-aware motion +adapter into ControlNet to capture temporal motion correspondence. While +ControlNet enables direct generation based on skeleton poses, it encounters +challenges when modifying the source motion in the inverted noise due to +contradictory signals between the noise (source) and the condition (reference). +Our adapter complements ControlNet by involving source content to transfer +adapted control signals seamlessly. Further, we build up a two-branch +architecture (a reconstruction branch and an editing branch) with a +high-fidelity attention injection mechanism facilitating branch interaction. +This mechanism enables the editing branch to query the key and value from the +reconstruction branch in a decoupled manner, making the editing branch retain +the original background and protagonist appearance. We also propose a skeleton +alignment algorithm to address the discrepancies in pose size and position. +Experiments demonstrate the promising motion editing ability of MotionEditor, +both qualitatively and quantitatively. + +
+
+ comment: 18 pages, 15 figures. Project page at + https://francis-rings.github.io/MotionEditor/ +
+
+
+
+
+ + ☆ MicroCinema: A Divide-and-Conquer Approach for Text-to-Video Generation + + +
+ We present MicroCinema, a straightforward yet effective framework for +high-quality and coherent text-to-video generation. Unlike existing approaches +that align text prompts with video directly, MicroCinema introduces a +Divide-and-Conquer strategy which divides the text-to-video into a two-stage +process: text-to-image generation and image\&text-to-video generation. This +strategy offers two significant advantages. a) It allows us to take full +advantage of the recent advances in text-to-image models, such as Stable +Diffusion, Midjourney, and DALLE, to generate photorealistic and highly +detailed images. b) Leveraging the generated image, the model can allocate less +focus to fine-grained appearance details, prioritizing the efficient learning +of motion dynamics. To implement this strategy effectively, we introduce two +core designs. First, we propose the Appearance Injection Network, enhancing the +preservation of the appearance of the given image. Second, we introduce the +Appearance Noise Prior, a novel mechanism aimed at maintaining the capabilities +of pre-trained 2D diffusion models. These design elements empower MicroCinema +to generate high-quality videos with precise motion, guided by the provided +text prompts. Extensive experiments demonstrate the superiority of the proposed +framework. Concretely, MicroCinema achieves SOTA zero-shot FVD of 342.86 on +UCF-101 and 377.40 on MSR-VTT. See +https://wangyanhui666.github.io/MicroCinema.github.io/ for video samples. + +
+
+ comment: Project page: https://wangyanhui666.github.io/MicroCinema.github.io/ +
+
+
+
+
+ + ☆ One-step Diffusion with Distribution Matching Distillation + + +
+ Diffusion models generate high-quality images but require dozens of forward
+passes. We introduce Distribution Matching Distillation (DMD), a procedure to
+transform a diffusion model into a one-step image generator with minimal impact
+on image quality. We enforce that the one-step image generator matches the
+diffusion model at the distribution level, by minimizing an approximate KL
+divergence whose gradient can be expressed as the difference between two score
+functions, one of the target distribution and the other of the synthetic
+distribution being produced by our one-step generator. The score functions are
+parameterized as two diffusion models trained separately on each distribution.
+Combined with a simple regression loss matching the large-scale structure of the
+multi-step diffusion outputs, our method outperforms all published few-step
+diffusion approaches, reaching 2.62 FID on ImageNet 64x64 and 11.49 FID on
+zero-shot COCO-30k, comparable to Stable Diffusion but orders of magnitude
+faster. Utilizing FP16 inference, our model can generate images at 20 FPS on
+modern hardware. + +
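+ The distribution-matching gradient, a difference of two score estimates evaluated at
+the generator's output, can be sketched as below; `score_real` and `score_fake` stand
+for the two separately trained diffusion score models, and the time weighting and
+noise injection are omitted, so this is an assumption-laden illustration rather than
+the paper's implementation:
+
+ import torch
+
+ def dmd_generator_loss(generator, score_real, score_fake, z, t):
+     """Surrogate loss whose gradient pushes samples along (s_fake - s_real)."""
+     x = generator(z)
+     with torch.no_grad():
+         direction = score_fake(x, t) - score_real(x, t)   # approximate KL gradient
+     return (direction * x).sum()   # gradient flows only through x into the generator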
+
+ comment: Project page: https://tianweiy.github.io/dmd/ +
+
+
+
+
+ + ☆ Motion-Conditioned Image Animation for Video Editing + + +
+ We introduce MoCA, a Motion-Conditioned Image Animation approach for video +editing. It leverages a simple decomposition of the video editing problem into +image editing followed by motion-conditioned image animation. Furthermore, +given the lack of robust evaluation datasets for video editing, we introduce a +new benchmark that measures edit capability across a wide variety of tasks, +such as object replacement, background changes, style changes, and motion +edits. We present a comprehensive human evaluation of the latest video editing +methods along with MoCA, on our proposed benchmark. MoCA establishes a new +state-of-the-art, demonstrating greater human preference win-rate, and +outperforming notable recent approaches including Dreamix (63%), MasaCtrl +(75%), and Tune-A-Video (72%), with especially significant improvements for +motion edits. + +
+
+ comment: Project page: https://facebookresearch.github.io/MoCA +
+
+
+
+
+ + ☆ CAST: Cross-Attention in Space and Time for Video Action Recognition NeurIPS 2023 + + +
+ Recognizing human actions in videos requires spatial and temporal +understanding. Most existing action recognition models lack a balanced +spatio-temporal understanding of videos. In this work, we propose a novel +two-stream architecture, called Cross-Attention in Space and Time (CAST), that +achieves a balanced spatio-temporal understanding of videos using only RGB +input. Our proposed bottleneck cross-attention mechanism enables the spatial +and temporal expert models to exchange information and make synergistic +predictions, leading to improved performance. We validate the proposed method +with extensive experiments on public benchmarks with different characteristics: +EPIC-KITCHENS-100, Something-Something-V2, and Kinetics-400. Our method +consistently shows favorable performance across these datasets, while the +performance of existing methods fluctuates depending on the dataset +characteristics. + +
+
+ comment: This is an accepted NeurIPS 2023. Project webpage is available at + https://jong980812.github.io/CAST.github.io/ Code is available at + https://github.com/KHU-VLL/CAST +
+
+
+
+
+ + ☆ Initializing Models with Larger Ones + + +
+ Weight initialization plays an important role in neural network training. +Widely used initialization methods are proposed and evaluated for networks that +are trained from scratch. However, the growing number of pretrained models now +offers new opportunities for tackling this classical problem of weight +initialization. In this work, we introduce weight selection, a method for +initializing smaller models by selecting a subset of weights from a pretrained +larger model. This enables the transfer of knowledge from pretrained weights to +smaller models. Our experiments demonstrate that weight selection can +significantly enhance the performance of small models and reduce their training +time. Notably, it can also be used together with knowledge distillation. Weight +selection offers a new approach to leverage the power of pretrained models in +resource-constrained settings, and we hope it can be a useful tool for training +small models in the large-model era. Code is available at +https://github.com/OscarXZQ/weight-selection. + +
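+ A naive form of this idea, initializing a smaller layer by taking a slice of a
+pretrained larger layer's weights, can be sketched as below; the leading-slice
+selection rule is an assumption for illustration and not necessarily the paper's
+selection criterion:
+
+ import torch.nn as nn
+
+ def init_from_larger(small: nn.Linear, large: nn.Linear) -> nn.Linear:
+     """Copy a subset of a larger pretrained Linear layer's weights into a smaller one."""
+     out_f, in_f = small.weight.shape
+     small.weight.data.copy_(large.weight.data[:out_f, :in_f])
+     if small.bias is not None and large.bias is not None:
+         small.bias.data.copy_(large.bias.data[:out_f])
+     return small
+
+ Applying such a rule layer by layer gives the smaller model a pretrained-like starting
+point at zero extra training cost, which is the intuition the abstract describes.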
+
+
+
+
+ + ☆ ElasticDiffusion: Training-free Arbitrary Size Image Generation + + +
+ Diffusion models have revolutionized image generation in recent years, yet +they are still limited to a few sizes and aspect ratios. We propose +ElasticDiffusion, a novel training-free decoding method that enables pretrained +text-to-image diffusion models to generate images with various sizes. +ElasticDiffusion attempts to decouple the generation trajectory of a pretrained +model into local and global signals. The local signal controls low-level pixel +information and can be estimated on local patches, while the global signal is +used to maintain overall structural consistency and is estimated with a +reference image. We test our method on CelebA-HQ (faces) and LAION-COCO +(objects/indoor/outdoor scenes). Our experiments and qualitative results show +superior image coherence quality across aspect ratios compared to +MultiDiffusion and the standard decoding strategy of Stable Diffusion. Code: +https://github.com/MoayedHajiAli/ElasticDiffusion-official.git + +
+
+
+
+
+ + ☆ IMMA: Immunizing text-to-image Models against Malicious Adaptation + + +
+ Advancements in text-to-image models and fine-tuning methods have led to the
+increasing risk of malicious adaptation, i.e., fine-tuning to generate harmful
+unauthorized content. Recent works, e.g., Glaze or MIST, have developed
+data-poisoning techniques which protect the data against adaptation methods. In
+this work, we consider an alternative paradigm for protection. We propose to
+``immunize'' the model by learning model parameters that are difficult for
+adaptation methods to exploit when fine-tuning on malicious content; in short,
+IMMA. Empirical results show IMMA's effectiveness against malicious adaptations,
+including mimicking artistic styles and learning inappropriate/unauthorized
+content, over three adaptation methods: LoRA, Textual-Inversion, and
+DreamBooth. + +
+
+
+
+
+ + ☆ Is Underwater Image Enhancement All Object Detectors Need? + + +
+ Underwater object detection is a crucial and challenging problem in marine
+engineering and aquatic robotics. The difficulty arises partly from the
+degradation of underwater images caused by selective light absorption and
+scattering. Intuitively, enhancing underwater images can benefit high-level
+applications like underwater object detection. However, it is still unclear
+whether all object detectors need underwater image enhancement as
+pre-processing. We therefore pose the questions "Does underwater image
+enhancement really improve underwater object detection?" and "How does
+underwater image enhancement contribute to underwater object detection?". With
+these two questions, we conduct extensive studies. Specifically, we use 18
+state-of-the-art underwater image enhancement algorithms, covering traditional,
+CNN-based, and GAN-based algorithms, to pre-process underwater object detection
+data. Then, we retrain 7 popular deep learning-based object detectors using the
+corresponding results enhanced by different algorithms, obtaining 126
+underwater object detection models. Coupled with 7 object detection models
+retrained using raw underwater images, we employ these 133 models to
+comprehensively analyze the effect of underwater image enhancement on
+underwater object detection. We expect this study to provide sufficient
+exploration to answer the aforementioned questions and draw more attention from
+the community to the joint problem of underwater image enhancement and
+underwater object detection. The pre-trained models and results are publicly
+available and will be regularly updated. Project page:
+https://github.com/BIGWangYuDong/lqit/tree/main/configs/detection/uw_enhancement_affect_detection. + +
+
+ comment: 17 pages, 9 figures +
+
+
+
+
+ + ☆ Convergence of Nonconvex PnP-ADMM with MMSE Denoisers + + +
+ Plug-and-Play Alternating Direction Method of Multipliers (PnP-ADMM) is a +widely-used algorithm for solving inverse problems by integrating physical +measurement models and convolutional neural network (CNN) priors. PnP-ADMM has +been theoretically proven to converge for convex data-fidelity terms and +nonexpansive CNNs. It has however been observed that PnP-ADMM often empirically +converges even for expansive CNNs. This paper presents a theoretical +explanation for the observed stability of PnP-ADMM based on the interpretation +of the CNN prior as a minimum mean-squared error (MMSE) denoiser. Our +explanation parallels a similar argument recently made for the iterative +shrinkage/thresholding algorithm variant of PnP (PnP-ISTA) and relies on the +connection between MMSE denoisers and proximal operators. We also numerically +evaluate the performance gap between PnP-ADMM using a nonexpansive DnCNN +denoiser and expansive DRUNet denoiser, thus motivating the use of expansive +CNNs. + +
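+ For context, a generic PnP-ADMM iteration with a CNN denoiser replacing the prior's
+proximal step can be sketched as below; `data_prox` and `denoiser` are placeholders
+with assumed compatible shapes, and no claim is made about the paper's exact
+parameterization:
+
+ import numpy as np
+
+ def pnp_admm(x0, y, data_prox, denoiser, iters=50, rho=1.0):
+     """x-update: data-fidelity proximal step; z-update: plug-in CNN denoiser; u: dual."""
+     x = x0.copy()
+     z = x0.copy()
+     u = np.zeros_like(x0)
+     for _ in range(iters):
+         x = data_prox(y, z - u, rho)   # proximal operator of the data-fidelity term
+         z = denoiser(x + u)            # CNN prior used as an implicit proximal map
+         u = u + x - z                  # dual (scaled Lagrange multiplier) update
+     return x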
+
+
+
+
+ + ☆ FoundPose: Unseen Object Pose Estimation with Foundation Features + + +
+ We propose FoundPose, a method for 6D pose estimation of unseen rigid objects +from a single RGB image. The method assumes that 3D models of the objects are +available but does not require any object-specific training. This is achieved +by building upon DINOv2, a recent vision foundation model with impressive +generalization capabilities. An online pose estimation stage is supported by a +minimal object representation that is built during a short onboarding stage +from DINOv2 patch features extracted from rendered object templates. Given a +query image with an object segmentation mask, FoundPose first rapidly retrieves +a handful of similarly looking templates by a DINOv2-based bag-of-words +approach. Pose hypotheses are then generated from 2D-3D correspondences +established by matching DINOv2 patch features between the query image and a +retrieved template, and finally optimized by featuremetric refinement. The +method can handle diverse objects, including challenging ones with symmetries +and without any texture, and noticeably outperforms existing RGB methods for +coarse pose estimation in both accuracy and speed on the standard BOP +benchmark. With the featuremetric and additional MegaPose refinement, which are +demonstrated complementary, the method outperforms all RGB competitors. Source +code is at: evinpinar.github.io/foundpose. + +
+
+
+
+
+ + ☆ BIOCLIP: A Vision Foundation Model for the Tree of Life + + +
+ Images of the natural world, collected by a variety of cameras, from drones +to individual phones, are increasingly abundant sources of biological +information. There is an explosion of computational methods and tools, +particularly computer vision, for extracting biologically relevant information +from images for science and conservation. Yet most of these are bespoke +approaches designed for a specific task and are not easily adaptable or +extendable to new questions, contexts, and datasets. A vision model for general +organismal biology questions on images is of timely need. To approach this, we +curate and release TreeOfLife-10M, the largest and most diverse ML-ready +dataset of biology images. We then develop BioCLIP, a foundation model for the +tree of life, leveraging the unique properties of biology captured by +TreeOfLife-10M, namely the abundance and variety of images of plants, animals, +and fungi, together with the availability of rich structured biological +knowledge. We rigorously benchmark our approach on diverse fine-grained biology +classification tasks, and find that BioCLIP consistently and substantially +outperforms existing baselines (by 17% to 20% absolute). Intrinsic evaluation +reveals that BioCLIP has learned a hierarchical representation conforming to +the tree of life, shedding light on its strong generalizability. Our code, +models and data will be made available at +https://github.com/Imageomics/bioclip. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ Distributed Global Structure-from-Motion with a Deep Front-End + + +
+ While initial approaches to Structure-from-Motion (SfM) revolved around both +global and incremental methods, most recent applications rely on incremental +systems to estimate camera poses due to their superior robustness. Though there +has been tremendous progress in SfM `front-ends' powered by deep models learned +from data, the state-of-the-art (incremental) SfM pipelines still rely on +classical SIFT features, developed in 2004. In this work, we investigate +whether leveraging the developments in feature extraction and matching helps +global SfM perform on par with the SOTA incremental SfM approach (COLMAP). To +do so, we design a modular SfM framework that allows us to easily combine +developments in different stages of the SfM pipeline. Our experiments show that +while developments in deep-learning based two-view correspondence estimation do +translate to improvements in point density for scenes reconstructed with global +SfM, none of them outperform SIFT when comparing with incremental SfM results +on a range of datasets. Our SfM system is designed from the ground up to +leverage distributed computation, enabling us to parallelize computation on +multiple machines and scale to large scenes. + +
+
+
+
+
+ + ☆ X-InstructBLIP: A Framework for aligning X-Modal instruction-aware + representations to LLMs and Emergent Cross-modal Reasoning + + +
+ Vision-language pre-training and instruction tuning have demonstrated +general-purpose capabilities in 2D visual reasoning tasks by aligning visual +encoders with state-of-the-art large language models (LLMs). In this paper, we +introduce a simple, yet effective, cross-modality framework built atop frozen +LLMs that allows the integration of various modalities without extensive +modality-specific customization. To facilitate instruction-modality +fine-tuning, we collect high-quality instruction tuning data in an automatic +and scalable manner, composed of 24K QA samples for audio and 250K QA samples +for 3D. Leveraging instruction-aware representations, our model performs +comparably with leading-edge counterparts without the need of extensive +modality-specific pre-training or customization. Furthermore, our approach +demonstrates cross-modal reasoning abilities across two or more input +modalities, despite each modality projection being trained individually. To +study the model's cross-modal abilities, we contribute a novel Discriminative +Cross-modal Reasoning (DisCRn) evaluation task, comprising 9K audio-video QA +samples and 28K image-3D QA samples that require the model to reason +discriminatively across disparate input modalities. + +
+
+
+
+
+ + ☆ Automated interpretation of congenital heart disease from multi-view + echocardiograms + + +
+ Congenital heart disease (CHD) is the most common birth defect and the +leading cause of neonate death in China. Clinical diagnosis can be based on the +selected 2D key-frames from five views. Limited by the availability of +multi-view data, most methods have to rely on the insufficient single view +analysis. This study proposes to automatically analyze the multi-view +echocardiograms with a practical end-to-end framework. We collect the five-view +echocardiograms video records of 1308 subjects (including normal controls, +ventricular septal defect (VSD) patients and atrial septal defect (ASD) +patients) with both disease labels and standard-view key-frame labels. +Depthwise separable convolution-based multi-channel networks are adopted to +largely reduce the network parameters. We also approach the imbalanced class +problem by augmenting the positive training samples. Our 2D key-frame model can +diagnose CHD or negative samples with an accuracy of 95.4\%, and in negative, +VSD or ASD classification with an accuracy of 92.3\%. To further alleviate the +work of key-frame selection in real-world implementation, we propose an +adaptive soft attention scheme to directly explore the raw video data. Four +kinds of neural aggregation methods are systematically investigated to fuse the +information of an arbitrary number of frames in a video. Moreover, with a view +detection module, the system can work without the view records. Our video-based +model can diagnose with an accuracy of 93.9\% (binary classification), and +92.1\% (3-class classification) in a collected 2D video testing set, which does +not need key-frame selection and view annotation in testing. The detailed +ablation study and the interpretability analysis are provided. + +
+
+ comment: Published in Medical Image Analysis +
+
+
+
+
+ + ☆ CoDi-2: In-Context, Interleaved, and Interactive Any-to-Any Generation + + +
+ We present CoDi-2, a versatile and interactive Multimodal Large Language +Model (MLLM) that can follow complex multimodal interleaved instructions, +conduct in-context learning (ICL), reason, chat, edit, etc., in an any-to-any +input-output modality paradigm. By aligning modalities with language for both +encoding and generation, CoDi-2 empowers Large Language Models (LLMs) to not +only understand complex modality-interleaved instructions and in-context +examples, but also autoregressively generate grounded and coherent multimodal +outputs in the continuous feature space. To train CoDi-2, we build a +large-scale generation dataset encompassing in-context multimodal instructions +across text, vision, and audio. CoDi-2 demonstrates a wide range of zero-shot +capabilities for multimodal generation, such as in-context learning, reasoning, +and compositionality of any-to-any modality generation through multi-round +interactive conversation. CoDi-2 surpasses previous domain-specific models on +tasks such as subject-driven image generation, vision transformation, and audio +editing. CoDi-2 signifies a substantial breakthrough in developing a +comprehensive multimodal foundation model adept at interpreting in-context +language-vision-audio interleaved instructions and producing multimodal +outputs. + +
+
+ comment: Project Page: https://codi-2.github.io/ +
+
+
+
+
+ + ☆ Spacewalk-18: A Benchmark for Multimodal and Long-form Procedural Video + Understanding in Novel Domains + + +
+ Learning from videos is an emerging research area that enables robots to
+acquire skills from human demonstrations, such as procedural videos. To do
+this, video-language models must be able to obtain structured understandings,
+such as the temporal segmentation of a demonstration into sequences of actions
+and skills, and to generalize the understandings to novel domains. In pursuit
+of this goal, we introduce Spacewalk-18, a benchmark containing two tasks: (1)
+step recognition and (2) intra-video retrieval over a dataset of temporally
+segmented and labeled tasks in International Space Station spacewalk
+recordings. In tandem, the two tasks quantify a model's ability to make use of:
+(1) out-of-domain visual information; (2) a long temporal context window; and
+(3) multimodal (text + video) domains. This departs from existing benchmarks
+for procedural video understanding, which typically deal with short context
+lengths and can be solved with a single modality. Spacewalk-18, with its
+inherent multimodal and long-form complexity, exposes the high difficulty of
+task recognition and segmentation. We find that state-of-the-art methods
+perform poorly on our benchmark, demonstrating that the goal of generalizable
+procedural video understanding models is still far off, underscoring the need
+to develop new approaches to these tasks. Data, model, and code will be
+publicly released. + +
+
+ comment: Under submission. Code and models will be released at + https://brown-palm.github.io/Spacewalk-18/ +
+
+
+
+
+ + ☆ MLLMs-Augmented Visual-Language Representation Learning + + +
+ Visual-language pre-training (VLP) has achieved remarkable success in
+multi-modal tasks, largely attributed to the availability of large-scale
+image-text datasets. In this work, we demonstrate that multi-modal large
+language models (MLLMs) can enhance visual-language representation learning by
+improving data quality. Our approach is simple, utilizing MLLMs to generate
+multiple extended captions for each image. To prevent the bias introduced by
+MLLMs' hallucinations and intrinsic caption styles, we propose "text shearing"
+to keep the lengths of extended captions identical to the originals. In
+image-text retrieval, our method consistently obtains 5.6 ~ 35.0% and 16.8 ~
+46.1% improvement on R@1 under the fine-tuning and zero-shot settings,
+respectively. Notably, our zero-shot results are comparable to fine-tuning on
+target datasets, which encourages more exploration on the versatile use of
+MLLMs. + +
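+ The "text shearing" step, as described, amounts to truncating each MLLM-extended
+caption back to the length of the original caption; a literal token-level version is
+sketched below, with the tokenizer treated as a stand-in rather than the authors'
+choice:
+
+ def text_shear(original: str, extended: str, tokenizer) -> str:
+     """Keep only as many tokens of the extended caption as the original caption had."""
+     target_len = len(tokenizer.encode(original))
+     sheared_ids = tokenizer.encode(extended)[:target_len]
+     return tokenizer.decode(sheared_ids)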
+
+
+
+
+ + ☆ Continual Diffusion with STAMINA: STack-And-Mask INcremental Adapters + + +
+ Recent work has demonstrated a remarkable ability to customize text-to-image +diffusion models to multiple, fine-grained concepts in a sequential (i.e., +continual) manner while only providing a few example images for each concept. +This setting is known as continual diffusion. Here, we ask the question: Can we +scale these methods to longer concept sequences without forgetting? Although +prior work mitigates the forgetting of previously learned concepts, we show +that its capacity to learn new tasks reaches saturation over longer sequences. +We address this challenge by introducing a novel method, STack-And-Mask +INcremental Adapters (STAMINA), which is composed of low-ranked +attention-masked adapters and customized MLP tokens. STAMINA is designed to +enhance the robust fine-tuning properties of LoRA for sequential concept +learning via learnable hard-attention masks parameterized with low rank MLPs, +enabling precise, scalable learning via sparse adaptation. Notably, all +introduced trainable parameters can be folded back into the model after +training, inducing no additional inference parameter costs. We show that +STAMINA outperforms the prior SOTA for the setting of text-to-image continual +customization on a 50-concept benchmark composed of landmarks and human faces, +with no stored replay data. Additionally, we extended our method to the setting +of continual learning for image classification, demonstrating that our gains +also translate to state-of-the-art performance in this standard benchmark. + +
+
+
+
+
+ + ☆ Semi-supervised Semantic Segmentation via Boosting Uncertainty on + Unlabeled Data + + +
+ We bring a new perspective to semi-supervised semantic segmentation by +providing an analysis on the labeled and unlabeled distributions in training +datasets. We first figure out that the distribution gap between labeled and +unlabeled datasets cannot be ignored, even though the two datasets are sampled +from the same distribution. To address this issue, we theoretically analyze and +experimentally prove that appropriately boosting uncertainty on unlabeled data +can help minimize the distribution gap, which benefits the generalization of +the model. We propose two strategies and design an uncertainty booster +algorithm, specially for semi-supervised semantic segmentation. Extensive +experiments are carried out based on these theories, and the results confirm +the efficacy of the algorithm and strategies. Our plug-and-play uncertainty +booster is tiny, efficient, and robust to hyperparameters but can significantly +promote performance. Our approach achieves state-of-the-art performance in our +experiments compared to the current semi-supervised semantic segmentation +methods on the popular benchmarks: Cityscapes and PASCAL VOC 2012 with +different train settings. + +
+
+
+
+
+ + ☆ Learning One-Shot 4D Head Avatar Synthesis using Synthetic Data + + +
+ Existing one-shot 4D head synthesis methods usually learn from monocular
+videos with the aid of 3DMM reconstruction, yet the latter is equally
+challenging, which prevents them from achieving reasonable 4D head synthesis.
+We present a method to learn one-shot 4D head synthesis via large-scale
+synthetic data. The key is to first learn a part-wise 4D generative model from
+monocular images via adversarial learning, to synthesize multi-view images of
+diverse identities and full motions as training data; then leverage a
+transformer-based animatable triplane reconstructor to learn 4D head
+reconstruction using the synthetic data. A novel learning strategy is employed
+to enhance generalizability to real images by disentangling the learning
+process of 3D reconstruction and reenactment. Experiments demonstrate our
+superiority over the prior art. + +
+
+ comment: Project page: https://yudeng.github.io/Portrait4D/ +
+
+
+
+
+ + ☆ Meta-Prior: Meta learning for Adaptive Inverse Problem Solvers + + +
+ Deep neural networks have become a foundational tool for addressing imaging +inverse problems. They are typically trained for a specific task, with a +supervised loss to learn a mapping from the observations to the image to +recover. However, real-world imaging challenges often lack ground truth data, +rendering traditional supervised approaches ineffective. Moreover, for each new +imaging task, a new model needs to be trained from scratch, wasting time and +resources. To overcome these limitations, we introduce a novel approach based +on meta-learning. Our method trains a meta-model on a diverse set of imaging +tasks that allows the model to be efficiently fine-tuned for specific tasks +with few fine-tuning steps. We show that the proposed method extends to the +unsupervised setting, where no ground truth data is available. In its bilevel +formulation, the outer level uses a supervised loss, that evaluates how well +the fine-tuned model performs, while the inner loss can be either supervised or +unsupervised, relying only on the measurement operator. This allows the +meta-model to leverage a few ground truth samples for each task while being +able to generalize to new imaging tasks. We show that in simple settings, this +approach recovers the Bayes optimal estimator, illustrating the soundness of +our approach. We also demonstrate our method's effectiveness on various tasks, +including image processing and magnetic resonance imaging. + +
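+ The bilevel structure described above resembles a first-order MAML-style update: an
+inner loop adapts a copy of the meta-model on a task's (possibly unsupervised,
+measurement-based) loss, and an outer loop evaluates the adapted copy and accumulates
+meta-gradients. The sketch below uses hypothetical `inner_loss` and `outer_loss`
+callables and plain first-order updates, not the paper's exact formulation:
+
+ import copy
+ import torch
+
+ def meta_step(meta_model, tasks, inner_loss, outer_loss, inner_lr=1e-3, inner_steps=3):
+     """Accumulate first-order meta-gradients over a batch of imaging tasks."""
+     meta_grads = None
+     for task in tasks:
+         model = copy.deepcopy(meta_model)
+         opt = torch.optim.SGD(model.parameters(), lr=inner_lr)
+         for _ in range(inner_steps):                 # inner, task-specific adaptation
+             opt.zero_grad()
+             inner_loss(model, task).backward()
+             opt.step()
+         opt.zero_grad()
+         outer_loss(model, task).backward()           # evaluate the adapted model
+         grads = [p.grad.clone() if p.grad is not None else torch.zeros_like(p)
+                  for p in model.parameters()]
+         meta_grads = grads if meta_grads is None else [g + h for g, h in zip(meta_grads, grads)]
+     for p, g in zip(meta_model.parameters(), meta_grads):
+         p.grad = g / len(tasks)                      # caller then steps the meta-optimizer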
+
+
+
+
+ + ☆ Seg2Reg: Differentiable 2D Segmentation to 1D Regression Rendering for + 360 Room Layout Reconstruction + + +
+ State-of-the-art single-view 360-degree room layout reconstruction methods +formulate the problem as a high-level 1D (per-column) regression task. On the +other hand, traditional low-level 2D layout segmentation is simpler to learn +and can represent occluded regions, but it requires complex post-processing for +the targeting layout polygon and sacrifices accuracy. We present Seg2Reg to +render 1D layout depth regression from the 2D segmentation map in a +differentiable and occlusion-aware way, marrying the merits of both sides. +Specifically, our model predicts floor-plan density for the input +equirectangular 360-degree image. Formulating the 2D layout representation as a +density field enables us to employ `flattened' volume rendering to form 1D +layout depth regression. In addition, we propose a novel 3D warping +augmentation on layout to improve generalization. Finally, we re-implement +recent room layout reconstruction methods into our codebase for benchmarking +and explore modern backbones and training techniques to serve as the strong +baseline. Our model significantly outperforms previous arts. The code will be +made available upon publication. + +
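+ The `flattened' volume rendering used to turn a 2D floor-plan density into a
+per-column layout depth can be pictured as ordinary alpha compositing along each
+column's ray; the sketch below assumes densities already sampled at known radial
+distances and is only an illustration of the rendering step, not the paper's code:
+
+ import torch
+
+ def render_column_depth(density, distances):
+     """density: (cols, samples) non-negative field; distances: (samples,) radii.
+     Returns an expected layout depth per image column via alpha compositing."""
+     alpha = 1.0 - torch.exp(-density)                              # per-sample opacity
+     ones = torch.ones_like(alpha[:, :1])
+     trans = torch.cumprod(torch.cat([ones, 1.0 - alpha[:, :-1]], dim=1), dim=1)
+     weights = alpha * trans                                        # rendering weights
+     return (weights * distances).sum(dim=1) / weights.sum(dim=1).clamp(min=1e-6)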
+
+
+
+
+ + ☆ RaDialog: A Large Vision-Language Model for Radiology Report Generation + and Conversational Assistance + + +
+ Conversational AI tools that can generate and discuss clinically correct +radiology reports for a given medical image have the potential to transform +radiology. Such a human-in-the-loop radiology assistant could facilitate a +collaborative diagnostic process, thus saving time and improving the quality of +reports. Towards this goal, we introduce RaDialog, the first thoroughly +evaluated and publicly available large vision-language model for radiology +report generation and interactive dialog. RaDialog effectively integrates +visual image features and structured pathology findings with a large language +model (LLM) while simultaneously adapting it to a specialized domain using +parameter-efficient fine-tuning. To keep the conversational abilities of the +underlying LLM, we propose a comprehensive, semi-automatically labeled, +image-grounded instruct dataset for chest X-ray radiology tasks. By training +with this dataset, our method achieves state-of-the-art clinical correctness in +report generation and shows impressive abilities in interactive tasks such as +correcting reports and answering questions, serving as a foundational step +toward clinical dialog systems. Our code is available on github: +https://github.com/ChantalMP/RaDialog. + +
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ☆ Cascaded Interaction with Eroded Deep Supervision for Salient Object + Detection + + +
+ Deep convolutional neural networks have been widely applied in salient object +detection and have achieved remarkable results in this field. However, existing +models suffer from information distortion caused by interpolation during +up-sampling and down-sampling. To address this drawback, this article +approaches the problem from two directions in the network: features and labels. On the one hand, +a novel cascaded interaction network with a guidance module named global-local +aligned attention (GAA) is designed to reduce the negative impact of +interpolation on the feature side. On the other hand, a deep supervision +strategy based on edge erosion is proposed to reduce the negative guidance of +label interpolation on the lateral outputs. Extensive experiments on five popular +datasets demonstrate the superiority of our method. + +
+
+
+
+
+ + ☆ Action Recognition in Video Recordings from Gynecologic Laparoscopy + + +
+ Action recognition is a prerequisite for many applications in laparoscopic +video analysis including but not limited to surgical training, operation room +planning, follow-up surgery preparation, post-operative surgical assessment, +and surgical outcome estimation. However, automatic action recognition in +laparoscopic surgeries involves numerous challenges such as (I) cross-action +and intra-action duration variation, (II) relevant content distortion due to +smoke, blood accumulation, fast camera motions, organ movements, object +occlusion, and (III) surgical scene variations due to different illuminations +and viewpoints. Besides, action annotations in laparoscopy surgeries are +limited and expensive due to requiring expert knowledge. In this study, we +design and evaluate a CNN-RNN architecture as well as a customized +training-inference framework to deal with the mentioned challenges in +laparoscopic surgery action recognition. Using stacked recurrent layers, our +proposed network takes advantage of inter-frame dependencies to negate the +negative effect of content distortion and variation in action recognition. +Furthermore, our proposed frame sampling strategy effectively manages the +duration variations in surgical actions to enable action recognition with high +temporal resolution. Our extensive experiments confirm the superiority of our +proposed method in action recognition compared to static CNNs. + +
+
+
+
+
+ + ☆ Pose Estimation and Tracking for ASIST + + +
+ Aircraft Ship Integrated Secure and Traverse (ASIST) is a system designed to +arrest helicopters safely and efficiently on ships. Originally, a precision +Helicopter Position Sensing Equipment (HPSE) tracked and monitored the position +of the helicopter relative to the Rapid Securing Device (RSD). However, using +the HPSE component was determined to be infeasible in the transition of the +ASIST system due to the hardware installation requirements. As a result, +sailors track the position of the helicopters by eye, with no sensor or +artificially intelligent decision aid. Manually tracking the helicopter takes +additional time and makes recoveries more difficult, especially at high sea +states. Performing recoveries without the decision aid leads to higher +uncertainty and cognitive load. PETA (Pose Estimation and Tracking for ASIST) +is a research effort to create a helicopter tracking system prototype without +hardware installation requirements for ASIST system operators. Its overall goal +is to improve situational awareness and reduce operator uncertainty with +respect to the aircraft's position relative to the RSD, and consequently +increase the allowable landing area. The authors produced a prototype system +capable of tracking helicopters with respect to the RSD. The software included +a helicopter pose estimation component, a camera pose estimation component, and a +user interface component. PETA demonstrated the potential for state-of-the-art +computer vision algorithms, Faster R-CNN and HRNet (High-Resolution Network), to +be used to estimate the pose of helicopters in real-time, returning ASIST to +its originally intended capability. PETA also demonstrated that traditional +encoder-decoder methods could be used to estimate the orientation of the +helicopter and could be used to confirm the output from HRNet. + +
+
+ comment: 7 pages, 8 figures. Published in the Proceedings of the ASNE 2023 + Technology, Systems & Ships Symposium. Reproduced with permission from the + American Society of Naval Engineers. Distribution Statement A: Approved for + public release; distribution is unlimited, as submitted under NAVAIR Public + Release Authorization 2023-018 +
+
+
+
+
+ + ☆ Multi-task learning with cross-task consistency for improved depth + estimation in colonoscopy + + +
+ Colonoscopy screening is the gold standard procedure for assessing +abnormalities in the colon and rectum, such as ulcers and cancerous polyps. +Measuring the abnormal mucosal area and its 3D reconstruction can help quantify +the surveyed area and objectively evaluate disease burden. However, due to the +complex topology of these organs and variable physical conditions (for example, +lighting, large homogeneous texture, and image modality), estimating distance +from the camera (i.e., depth) is highly challenging. Moreover, most colonoscopic +video acquisition is monocular, making the depth estimation a non-trivial +problem. While methods in computer vision for depth estimation have been +proposed and advanced on natural scene datasets, the efficacy of these +techniques has not been widely quantified on colonoscopy datasets. As the +colonic mucosa has several low-texture regions that are not well pronounced, +learning representations from an auxiliary task can improve salient feature +extraction, allowing estimation of accurate camera depths. In this work, we +propose to develop a novel multi-task learning (MTL) approach with a shared +encoder and two decoders, namely a surface normal decoder and a depth estimator +decoder. Our depth estimator incorporates attention mechanisms to enhance +global context awareness. We leverage the surface normal prediction to improve +geometric feature extraction. Also, we apply a cross-task consistency loss +between the two geometrically related tasks, surface normal and camera depth. We +demonstrate a 14.17% improvement in relative error and a 10.4% improvement in +$\delta_{1}$ accuracy over the most accurate baseline state-of-the-art BTS +approach. All experiments are conducted on the recently released C3VD dataset; +thus, we provide a first benchmark of state-of-the-art methods. + +
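+ One common way to realize such a cross-task consistency term is to derive
+pseudo-normals from the predicted depth and penalise their disagreement with
+the normal decoder's output; the snippet below is a simplified sketch
+(orthographic finite differences), not necessarily the exact loss used in the
+paper:
+<pre><code>
+import torch
+import torch.nn.functional as F
+
+def normals_from_depth(depth):
+    # depth: (B, 1, H, W); pseudo-normals via finite differences (orthographic approximation)
+    dz_dx = F.pad(depth[:, :, :, 1:] - depth[:, :, :, :-1], (0, 1, 0, 0))
+    dz_dy = F.pad(depth[:, :, 1:, :] - depth[:, :, :-1, :], (0, 0, 0, 1))
+    n = torch.cat([-dz_dx, -dz_dy, torch.ones_like(depth)], dim=1)
+    return F.normalize(n, dim=1)
+
+def cross_task_consistency(pred_depth, pred_normals):
+    # 1 - cosine similarity between depth-derived and predicted normals
+    return (1.0 - F.cosine_similarity(normals_from_depth(pred_depth), pred_normals, dim=1)).mean()
+</code></pre>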
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ Learning Part Segmentation from Synthetic Animals + + +
+ Semantic part segmentation provides an intricate and interpretable +understanding of an object, thereby benefiting numerous downstream tasks. +However, the need for exhaustive annotations impedes its usage across diverse +object types. This paper focuses on learning part segmentation from synthetic +animals, leveraging the Skinned Multi-Animal Linear (SMAL) models to scale up +existing synthetic data generated by computer-aided design (CAD) animal models. +Compared to CAD models, SMAL models generate data with a wider range of poses +observed in real-world scenarios. As a result, our first contribution is to +construct a synthetic animal dataset of tigers and horses with more pose +diversity, termed Synthetic Animal Parts (SAP). We then benchmark Syn-to-Real +animal part segmentation from SAP to PartImageNet, namely SynRealPart, with +existing semantic segmentation domain adaptation methods and further improve +them as our second contribution. Concretely, we examine three Syn-to-Real +adaptation methods but observe relative performance drop due to the innate +difference between the two tasks. To address this, we propose a simple yet +effective method called Class-Balanced Fourier Data Mixing (CB-FDM). Fourier +Data Mixing aligns the spectral amplitudes of synthetic images with real +images, thereby making the mixed images have more similar frequency content to +real images. We further use Class-Balanced Pseudo-Label Re-Weighting to +alleviate the imbalanced class distribution. We demonstrate the efficacy of +CB-FDM on SynRealPart over previous methods with significant performance +improvements. Remarkably, our third contribution is to reveal that the learned +parts from synthetic tiger and horse are transferable across all quadrupeds in +PartImageNet, further underscoring the utility and potential applications of +animal part segmentation. + +
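+ The Fourier Data Mixing component can be sketched as a classic low-frequency
+amplitude swap between a synthetic and a real image; the window size `beta`
+and blend weight `lam` are illustrative hyper-parameters, and the
+class-balanced pseudo-label re-weighting part is omitted:
+<pre><code>
+import numpy as np
+
+def fourier_amplitude_mix(syn, real, beta=0.15, lam=0.5):
+    # syn, real: float images of identical shape (H, W, C)
+    fft_s, fft_r = np.fft.fft2(syn, axes=(0, 1)), np.fft.fft2(real, axes=(0, 1))
+    amp_s, pha_s = np.abs(fft_s), np.angle(fft_s)
+    amp_s = np.fft.fftshift(amp_s, axes=(0, 1))
+    amp_r = np.fft.fftshift(np.abs(fft_r), axes=(0, 1))
+    h, w = syn.shape[:2]
+    bh, bw, cy, cx = int(h * beta), int(w * beta), h // 2, w // 2
+    win = (slice(cy - bh, cy + bh), slice(cx - bw, cx + bw))   # low-frequency window
+    amp_s[win] = lam * amp_r[win] + (1 - lam) * amp_s[win]     # blend real amplitude into synthetic
+    amp_s = np.fft.ifftshift(amp_s, axes=(0, 1))
+    return np.real(np.fft.ifft2(amp_s * np.exp(1j * pha_s), axes=(0, 1)))
+</code></pre>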
+
+
+
+
+ + ☆ Detailed Human-Centric Text Description-Driven Large Scene Synthesis + + +
+ Text-driven large scene image synthesis has made significant progress with +diffusion models, but controlling it is challenging. While using additional +spatial controls with corresponding texts has improved the controllability of +large scene synthesis, it is still challenging to faithfully reflect detailed +text descriptions without user-provided controls. Here, we propose +DetText2Scene, a novel text-driven large-scale image synthesis with high +faithfulness, controllability, and naturalness in a global context for the +detailed human-centric text description. Our DetText2Scene consists of 1) +hierarchical keypoint-box layout generation from the detailed description by +leveraging large language model (LLM), 2) view-wise conditioned joint diffusion +process to synthesize a large scene from the given detailed text with +LLM-generated grounded keypoint-box layout and 3) pixel perturbation-based +pyramidal interpolation to progressively refine the large scene for global +coherence. Our DetText2Scene significantly outperforms prior arts in +text-to-large scene synthesis qualitatively and quantitatively, demonstrating +strong faithfulness with detailed descriptions, superior controllability, and +excellent naturalness in a global context. + +
+
+
+
+
+ + ☆ LL3DA: Visual Interactive Instruction Tuning for Omni-3D Understanding, + Reasoning, and Planning + + +
+ Recent advances in Large Multimodal Models (LMMs) have enabled +various applications in human-machine interaction. However, developing LMMs +that can comprehend, reason, and plan in complex and diverse 3D environments +remains a challenging topic, especially considering the demand for +understanding permutation-invariant point cloud 3D representations of the 3D +scene. Existing works seek help from multi-view images and project 2D features +to 3D space as 3D scene representations. This, however, leads to huge +computational overhead and performance degradation. In this paper, we present +LL3DA, a Large Language 3D Assistant that takes point clouds as direct input and +responds to both textual instructions and visual prompts. This helps LMMs better +comprehend human interactions and further helps to remove the ambiguities in +cluttered 3D scenes. Experiments show that LL3DA achieves remarkable results, +and surpasses various 3D vision-language models on both 3D Dense Captioning and +3D Question Answering. + +
+
+ comment: Project Page: https://ll3da.github.io/ +
+
+
+
+
+ + ☆ Simple Semantic-Aided Few-Shot Learning + + +
+ Learning from a limited amount of data, namely Few-Shot Learning, stands out +as a challenging computer vision task. Several works exploit semantics and +design complicated semantic fusion mechanisms to compensate for rare +representative features within restricted data. However, relying on naive +semantics such as class names introduces biases due to their brevity, while +acquiring extensive semantics from external knowledge takes a huge time and +effort. This limitation severely constrains the potential of semantics in +few-shot learning. In this paper, we design an automatic way called Semantic +Evolution to generate high-quality semantics. The incorporation of high-quality +semantics alleviates the need for complex network structures and learning +algorithms used in previous works. Hence, we employ a simple two-layer network +termed Semantic Alignment Network to transform semantics and visual features +into robust class prototypes with rich discriminative features for few-shot +classification. The experimental results show our framework outperforms all +previous methods on five benchmarks, demonstrating a simple network with +high-quality semantics can beat intricate multi-modal modules on few-shot +classification tasks. + +
+
+
+
+
+ + ☆ Stochastic Vision Transformers with Wasserstein Distance-Aware Attention + + +
+ Self-supervised learning is one of the most promising approaches to acquiring +knowledge from limited labeled data. Despite the substantial advancements made +in recent years, self-supervised models have posed a challenge to +practitioners, as they do not readily provide insight into the model's +confidence and uncertainty. Tackling this issue is no simple feat, primarily +due to the complexity involved in implementing techniques that can make use of +the latent representations learned during pre-training without relying on +explicit labels. Motivated by this, we introduce a new stochastic vision +transformer that integrates uncertainty and distance awareness into +self-supervised learning (SSL) pipelines. Instead of the conventional +deterministic vector embedding, our novel stochastic vision transformer encodes +image patches into elliptical Gaussian distributional embeddings. Notably, the +attention matrices of these stochastic representational embeddings are computed +using Wasserstein distance-based attention, effectively capitalizing on the +distributional nature of these embeddings. Additionally, we propose a +regularization term based on Wasserstein distance for both pre-training and +fine-tuning processes, thereby incorporating distance awareness into latent +representations. We perform extensive experiments across different tasks such +as in-distribution generalization, out-of-distribution detection, dataset +corruption, semi-supervised settings, and transfer learning to other datasets +and tasks. Our proposed method achieves superior accuracy and calibration, +surpassing the self-supervised baseline in a wide range of experiments on a +variety of datasets. + +
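+ For diagonal-Gaussian patch embeddings, the squared 2-Wasserstein distance has
+the closed form ||mu_1 - mu_2||^2 + ||sigma_1 - sigma_2||^2, which can replace
+dot-product logits directly; below is a simplified single-head sketch (the
+paper's exact parameterisation may differ):
+<pre><code>
+import torch
+
+def w2_attention(mu_q, var_q, mu_k, var_k, v, tau=1.0):
+    # mu_*, var_*: (B, N, D) means and diagonal variances of token embeddings; v: (B, N, D)
+    d_mu = torch.cdist(mu_q, mu_k, p=2) ** 2
+    d_sig = torch.cdist(var_q.clamp_min(1e-8).sqrt(),
+                        var_k.clamp_min(1e-8).sqrt(), p=2) ** 2
+    attn = torch.softmax(-(d_mu + d_sig) / tau, dim=-1)        # closer distributions get more weight
+    return attn @ v
+</code></pre>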
+
+
+
+
+ + ☆ DiffusionAvatars: Deferred Diffusion for High-fidelity 3D Head Avatars + + +
+ DiffusionAvatars synthesizes a high-fidelity 3D head avatar of a person, +offering intuitive control over both pose and expression. We propose a +diffusion-based neural renderer that leverages generic 2D priors to produce +compelling images of faces. For coarse guidance of the expression and head +pose, we render a neural parametric head model (NPHM) from the target +viewpoint, which acts as a proxy geometry of the person. Additionally, to +enhance the modeling of intricate facial expressions, we condition +DiffusionAvatars directly on the expression codes obtained from NPHM via +cross-attention. Finally, to synthesize consistent surface details across +different viewpoints and expressions, we rig learnable spatial features to the +head's surface via TriPlane lookup in NPHM's canonical space. We train +DiffusionAvatars on RGB videos and corresponding tracked NPHM meshes of a +person and test the obtained avatars in both self-reenactment and animation +scenarios. Our experiments demonstrate that DiffusionAvatars generates +temporally consistent and visually appealing videos for novel poses and +expressions of a person, outperforming existing approaches. + +
+
+ comment: Project Page: https://tobias-kirschstein.github.io/diffusion-avatars/ + , Video: https://youtu.be/nSjDiiTnp2E +
+
+
+
+
+ + ☆ A Lightweight Clustering Framework for Unsupervised Semantic + Segmentation + + +
+ Unsupervised semantic segmentation aims to assign each pixel of an image to a +corresponding class without the use of annotated data. It is a widely +researched area, as obtaining labeled datasets is expensive. While previous +works in the field demonstrated a gradual improvement in segmentation +performance, most of them required neural network training. This made +segmentation equally expensive, especially when dealing with large-scale +datasets. We thereby propose a lightweight clustering framework for +unsupervised semantic segmentation. Attention features of the self-supervised +vision transformer exhibit strong foreground-background differentiability. By +clustering these features into a small number of clusters, we can separate +foreground and background image patches into distinct groupings. In our +clustering framework, we first obtain attention features from the +self-supervised vision transformer. Then we extract Dataset-level, +Category-level and Image-level masks by clustering features within the same +dataset, category and image. We further ensure multilevel clustering +consistency across the three levels, which allows us to extract patch-level +binary pseudo-masks. Finally, the pseudo-masks are upsampled and refined, and class +assignment is performed according to the CLS token of object regions. Our +framework demonstrates great promise in unsupervised semantic segmentation and +achieves state-of-the-art results on PASCAL VOC and MS COCO datasets. + +
+
+
+
+
+ + ☆ JPPF: Multi-task Fusion for Consistent Panoptic-Part Segmentation + + +
+ Part-aware panoptic segmentation is a problem of computer vision that aims to +provide a semantic understanding of the scene at multiple levels of +granularity. More precisely, semantic areas, object instances, and semantic +parts are predicted simultaneously. In this paper, we present our Joint +Panoptic Part Fusion (JPPF) that combines the three individual segmentations +effectively to obtain a panoptic-part segmentation. Two aspects are of utmost +importance for this: First, a unified model for the three problems is desired +that allows for mutually improved and consistent representation learning. +Second, balancing the combination so that it gives equal importance to all +individual results during fusion. Our proposed JPPF is parameter-free and +dynamically balances its input. The method is evaluated and compared on the +Cityscapes Panoptic Parts (CPP) and Pascal Panoptic Parts (PPP) datasets in +terms of PartPQ and Part-Whole Quality (PWQ). In extensive experiments, we +verify the importance of our fair fusion, highlight its most significant impact +for areas that can be further segmented into parts, and demonstrate the +generalization capabilities of our design without fine-tuning on 5 additional +datasets. + +
+
+ comment: Accepted for Springer Nature Computer Science. arXiv admin note: + substantial text overlap with arXiv:2212.07671 +
+
+
+
+
+ + ☆ Anatomy and Physiology of Artificial Intelligence in PET Imaging + + +
+ The influence of artificial intelligence (AI) within the field of nuclear +medicine has been rapidly growing. Many researchers and clinicians are seeking +to apply AI within PET, and clinicians will soon find themselves engaging with +AI-based applications all along the chain of molecular imaging, from image +reconstruction to enhanced reporting. This expanding presence of AI in PET +imaging will result in greater demand for educational resources for those +unfamiliar with AI. The objective of this article is to provide an illustrated +guide to the core principles of modern AI, with a specific focus on aspects that +are most likely to be encountered in PET imaging. We describe convolutional +neural networks and algorithm training, and explain the components of the commonly +used U-Net for segmentation and image synthesis. + +
+
+
+
+
+ + ☆ Cancer-Net PCa-Gen: Synthesis of Realistic Prostate Diffusion Weighted + Imaging Data via Anatomic-Conditional Controlled Latent Diffusion + + +
+ In Canada, prostate cancer is the most common form of cancer in men and +accounted for 20% of new cancer cases for this demographic in 2022. Due to +recent successes in leveraging machine learning for clinical decision support, +there has been significant interest in the development of deep neural networks +for prostate cancer diagnosis, prognosis, and treatment planning using +diffusion weighted imaging (DWI) data. A major challenge hindering widespread +adoption in clinical use is poor generalization of such networks due to +scarcity of large-scale, diverse, balanced prostate imaging datasets for +training such networks. In this study, we explore the efficacy of latent +diffusion for generating realistic prostate DWI data through the introduction +of an anatomic-conditional controlled latent diffusion strategy. To the best of +the authors' knowledge, this is the first study to leverage conditioning for +synthesis of prostate cancer imaging. Experimental results show that the +proposed strategy, which we call Cancer-Net PCa-Gen, enhances synthesis of +diverse prostate images through controllable tumour locations and better +anatomical and textural fidelity. These crucial features make it well-suited +for augmenting real patient data, enabling neural networks to be trained on a +more diverse and comprehensive data distribution. The Cancer-Net PCa-Gen +framework and sample images have been made publicly available at +https://www.kaggle.com/datasets/deetsadi/cancer-net-pca-gen-dataset as a part +of a global open-source initiative dedicated to accelerating advancement in +machine learning to aid clinicians in the fight against cancer. + +
+
+
+
+
+ + ☆ DiffCAD: Weakly-Supervised Probabilistic CAD Model Retrieval and + Alignment from an RGB Image + + +
+ Perceiving 3D structures from RGB images based on CAD model primitives can +enable an effective, efficient 3D object-based representation of scenes. +However, current approaches rely on supervision from expensive annotations of +CAD models associated with real images, and encounter challenges due to the +inherent ambiguities in the task -- both in depth-scale ambiguity in monocular +perception, as well as inexact matches of CAD database models to real +observations. We thus propose DiffCAD, the first weakly-supervised +probabilistic approach to CAD retrieval and alignment from an RGB image. We +formulate this as a conditional generative task, leveraging diffusion to learn +implicit probabilistic models capturing the shape, pose, and scale of CAD +objects in an image. This enables multi-hypothesis generation of different +plausible CAD reconstructions, requiring only a few hypotheses to characterize +ambiguities in depth/scale and inexact shape matches. Our approach is trained +only on synthetic data, leveraging monocular depth and mask estimates to enable +robust zero-shot adaptation to various real target domains. Despite being +trained solely on synthetic data, our multi-hypothesis approach can even +surpass the supervised state-of-the-art on the Scan2CAD dataset by 5.9% with 8 +hypotheses. + +
+
+ comment: Project page: https://daoyig.github.io/DiffCAD/ Video: + https://www.youtube.com/watch?v=PCursyPosMY +
+
+
+
+
+ + ☆ Contrastive Denoising Score for Text-guided Latent Diffusion Image + Editing + + +
+ With the remarkable advent of text-to-image diffusion models, image editing +methods have become more diverse and continue to evolve. A promising recent +approach in this realm is Delta Denoising Score (DDS) - an image editing +technique based on the Score Distillation Sampling (SDS) framework that leverages +the rich generative prior of text-to-image diffusion models. However, relying +solely on the difference between scoring functions is insufficient for +preserving specific structural elements from the original image, a crucial +aspect of image editing. Inspired by the similarities and important differences +between DDS and contrastive learning for unpaired image-to-image +translation (CUT), here we present an embarrassingly simple yet very powerful +modification of DDS, called Contrastive Denoising Score (CDS), for latent +diffusion models (LDM). Specifically, to enforce structural correspondence +between the input and output while maintaining the controllability of contents, +we introduce a straightforward approach to regulate structural consistency +using CUT loss within the DDS framework. To calculate this loss, instead of +employing auxiliary networks, we utilize the intermediate features of LDM, in +particular, those from the self-attention layers, which possess rich spatial +information. Our approach enables zero-shot image-to-image translation and +neural radiance field (NeRF) editing, achieving a well-balanced interplay +between maintaining the structural details and transforming content. +Qualitative results and comparisons demonstrate the effectiveness of our +proposed method. Project page with code is available at +https://hyelinnam.github.io/CDS/. + +
+
+ comment: Project page: https://hyelinnam.github.io/CDS/ +
+
+
+
+
+ + ☆ Learning Triangular Distribution in Visual World + + +
+ Convolutional neural networks are successful in pervasive vision tasks, including +label distribution learning, which usually takes the form of learning an +injection from the non-linear visual features to the well-defined labels. +However, how the discrepancy between features is mapped to the label +discrepancy is ambiguous, and its correctness is not guaranteed. To address these +problems, we study the mathematical connection between a feature and its label, +presenting a general and simple framework for label distribution learning. We +propose a so-called Triangular Distribution Transform (TDT) to build an +injective function between feature and label, guaranteeing that any symmetric +feature discrepancy linearly reflects the difference between labels. The +proposed TDT can be used as a plug-in in mainstream backbone networks to +address different label distribution learning tasks. Experiments on Facial Age +Recognition, Illumination Chromaticity Estimation, and Aesthetics Assessment +show that TDT achieves on-par or better results than the prior art. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Semantic-Aware Frame-Event Fusion based Pattern Recognition via Large + Vision-Language Models + + +
+ Pattern recognition through the fusion of RGB frames and Event streams has +emerged as a novel research area in recent years. Current methods typically +employ backbone networks to individually extract the features of RGB frames and +event streams, and subsequently fuse these features for pattern recognition. +However, we posit that these methods may suffer from key issues like semantic +gaps and small-scale backbone networks. In this study, we introduce a novel +pattern recognition framework that consolidates the semantic labels, RGB +frames, and event streams, leveraging pre-trained large-scale vision-language +models. Specifically, given the input RGB frames, event streams, and all the +predefined semantic labels, we employ a pre-trained large-scale vision model +(CLIP vision encoder) to extract the RGB and event features. To handle the +semantic labels, we initially convert them into language descriptions through +prompt engineering, and then obtain the semantic features using the pre-trained +large-scale language model (CLIP text encoder). Subsequently, we integrate the +RGB/Event features and semantic features using multimodal Transformer networks. +The resulting frame and event tokens are further amplified using self-attention +layers. Concurrently, we propose to enhance the interactions between text +tokens and RGB/Event tokens via cross-attention. Finally, we consolidate all +three modalities using self-attention and feed-forward layers for recognition. +Comprehensive experiments on the HARDVS and PokerEvent datasets fully +substantiate the efficacy of our proposed SAFE model. The source code will be +made available at https://github.com/Event-AHU/SAFE_LargeVLM. + +
+
+ comment: In Peer Review +
+
+
+
+
+ + ☆ Communication-Efficient Heterogeneous Federated Learning with + Generalized Heavy-Ball Momentum + + +
+ Federated Learning (FL) is the state-of-the-art approach for learning from +decentralized data in privacy-constrained scenarios. As the current literature +reports, the main problems associated with FL are system and statistical +challenges: the former demand efficient learning from edge devices, +including lowering communication bandwidth and frequency, while the latter +require algorithms robust to non-iidness. State-of-the-art approaches either +guarantee convergence at increased communication cost or are not sufficiently +robust to handle extremely heterogeneous local distributions. In this work we +propose a novel generalization of the heavy-ball momentum, and present FedHBM +to effectively address statistical heterogeneity in FL without introducing any +communication overhead. We conduct extensive experimentation on common FL +vision and NLP datasets, showing that our FedHBM algorithm empirically yields +better model quality and higher convergence speed w.r.t. the state of the art, +especially in pathological non-iid scenarios. While being designed for +cross-silo settings, we show how FedHBM is applicable in moderate-to-high +cross-device scenarios, and how good model initializations (e.g. pre-training) +can be exploited for prompt acceleration. Extended experimentation on +large-scale real-world federated datasets further corroborates the +effectiveness of our approach for real-world FL applications. + +
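+ For orientation, a plain heavy-ball client update takes the form
+w <- w - lr * grad + beta * (w - w_prev); the generalized momentum proposed in
+the paper is defined differently, so the snippet below is only a baseline
+sketch with placeholder arguments:
+<pre><code>
+import torch
+
+def heavy_ball_local_update(model, prev_params, data_loader, loss_fn, lr=0.01, beta=0.9):
+    # prev_params: list of tensors holding the previous iterate w_{t-1}, one per parameter
+    params = list(model.parameters())
+    for x, y in data_loader:
+        grads = torch.autograd.grad(loss_fn(model(x), y), params)
+        with torch.no_grad():
+            for p, g, p_prev in zip(params, grads, prev_params):
+                new_p = p - lr * g + beta * (p - p_prev)       # heavy-ball step
+                p_prev.copy_(p)                                # remember w_{t-1} for the next step
+                p.copy_(new_p)
+    return model
+</code></pre>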
+
+
+
+
+ + ☆ Fingerprint Matching with Localized Deep Representation + + +
+ Compared to minutia-based fingerprint representations, fixed-length +representations are attractive due to simple and efficient matching. However, +fixed-length fingerprint representations are limited in accuracy when matching +fingerprints with different visible areas, which can occur due to different +finger poses or acquisition methods. To address this issue, we propose a +localized deep representation of fingerprint, named LDRF. By focusing on the +discriminative characteristics within local regions, LDRF provides a more +robust and accurate fixed-length representation for fingerprints with variable +visible areas. LDRF can be adapted to retain information within any valid area, +making it highly flexible. The matching scores produced by LDRF also exhibit +intuitive statistical characteristics, which led us to propose a matching score +normalization technique to mitigate the uncertainty in the cases of very small +overlapping area. With this new technique, we can maintain a high level of +accuracy and reliability in our fingerprint matching, even as the size of the +database grows rapidly. Our experimental results on 21 datasets containing over +140K fingerprints of various finger poses and impression types show that LDRF +outperforms other fixed-length representations and is robust to sensing +technologies and impression types. Besides, the proposed matching score +normalization effectively reduces the false match rate (FMR) in large-scale +identification experiments comprising over 5.11 million fingerprints. +Specifically, this technique results in a reduction of two orders of magnitude +compared to matching without matching score normalization and five orders of +magnitude compared to prior works. + +
+
+ comment: 18 pages, 20 figures +
+
+
+
+
+ + ☆ Overcoming Label Noise for Source-free Unsupervised Video Domain + Adaptation + + +
+ Despite the progress seen in classification methods, current approaches for +handling videos with distribution shifts in source and target domains remain +source-dependent as they require access to the source data during the +adaptation stage. In this paper, we present a self-training based source-free +video domain adaptation approach to address this challenge by bridging the gap +between the source and the target domains. We use the source pre-trained model +to generate pseudo-labels for the target domain samples, which are inevitably +noisy. Thus, we treat the problem of source-free video domain adaptation as +learning from noisy labels and argue that the samples with correct +pseudo-labels can help us in adaptation. To this end, we leverage the +cross-entropy loss as an indicator of the correctness of the pseudo-labels and +use the resulting small-loss samples from the target domain for fine-tuning the +model. We further enhance the adaptation performance by implementing a +teacher-student framework, in which the teacher, which is updated gradually, +produces reliable pseudo-labels. Meanwhile, the student undergoes fine-tuning +on the target domain videos using these generated pseudo-labels to improve its +performance. Extensive experimental evaluations show that our methods, termed +as CleanAdapt, CleanAdapt + TS, achieve state-of-the-art results, outperforming +the existing approaches on various open datasets. Our source code is publicly +available at https://avijit9.github.io/CleanAdapt. + +
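+ The small-loss selection step can be sketched as follows: pseudo-label the
+target clips with the source (or teacher) model and keep only the fraction with
+the lowest cross-entropy for fine-tuning; `keep_ratio` and the batch-level
+selection are assumptions:
+<pre><code>
+import torch
+import torch.nn.functional as F
+
+@torch.no_grad()
+def select_small_loss_samples(model, target_clips, keep_ratio=0.5):
+    logits = model(target_clips)                               # (N, num_classes)
+    pseudo = logits.argmax(dim=1)                              # pseudo-labels
+    losses = F.cross_entropy(logits, pseudo, reduction="none") # proxy for label correctness
+    k = max(1, int(keep_ratio * len(losses)))
+    keep = losses.topk(k, largest=False).indices               # smallest-loss samples
+    return target_clips[keep], pseudo[keep]
+</code></pre>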
+
+ comment: Extended version of our ICVGIP paper +
+
+
+
+
+ + ☆ Seam-guided local alignment and stitching for large parallax images + + +
+ Seam-cutting methods have been proven effective in the composition step of +image stitching, especially for images with parallax. However, the +effectiveness of seam-cutting usually depends on the images being roughly +aligned, such that there exists a local region where a plausible seam can be +found. For images with large parallax, current alignment methods often fall +short of expectations. In this paper, we propose a local alignment and +stitching method guided by seam quality evaluation. First, we use existing +image alignment and seam-cutting methods to calculate an initial seam and +evaluate the quality of pixels along the seam. Then, for pixels with low +quality, we separate their enclosing patches in the aligned images and +locally align them by extracting modified dense correspondences via SIFT flow. +Finally, we composite the aligned patches via seam-cutting and merge them into +the original aligned result to generate the final mosaic. Experiments show that +compared with the state-of-the-art seam-cutting methods, our results are more +plausible and contain fewer artifacts. The code will be available at +https://github.com/tlliao/Seam-guided-local-alignment. + +
+
+ comment: 13 pages, 12 figures, in peer review +
+
+
+
+
+ + ☆ Periodic Vibration Gaussian: Dynamic Urban Scene Reconstruction and + Real-time Rendering + + +
+ Modeling dynamic, large-scale urban scenes is challenging due to their highly +intricate geometric structures and unconstrained dynamics in both space and +time. Prior methods often employ high-level architectural priors, separating +static and dynamic elements, resulting in suboptimal capture of their +synergistic interactions. To address this challenge, we present a unified +representation model, called Periodic Vibration Gaussian (PVG). PVG builds upon +the efficient 3D Gaussian splatting technique, originally designed for static +scene representation, by introducing periodic vibration-based temporal +dynamics. This innovation enables PVG to elegantly and uniformly represent the +characteristics of various objects and elements in dynamic urban scenes. To +enhance temporally coherent representation learning with sparse training data, +we introduce a novel flow-based temporal smoothing mechanism and a +position-aware adaptive control strategy. Extensive experiments on Waymo Open +Dataset and KITTI benchmarks demonstrate that PVG surpasses state-of-the-art +alternatives in both reconstruction and novel view synthesis for both dynamic +and static scenes. Notably, PVG achieves this without relying on manually +labeled object bounding boxes or expensive optical flow estimation. Moreover, +PVG exhibits 50/6000-fold acceleration in training/rendering over the best +alternative. + +
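+ The core idea of giving each Gaussian a periodic, time-dependent state can be
+sketched as below; the parameter names (vibration amplitude, life peak, period,
+temporal spread) are illustrative, and the exact PVG parameterisation is given
+in the paper:
+<pre><code>
+import torch
+
+def pvg_center_and_opacity(mu, amp, t, life_peak, period, beta):
+    # mu, amp: (N, 3); life_peak, period, beta: (N, 1); t: scalar time
+    phase = 2 * torch.pi * (t - life_peak) / period
+    center_t = mu + amp * torch.sin(phase)                     # vibrating Gaussian centre
+    opacity_scale = torch.exp(-0.5 * ((t - life_peak) / beta) ** 2)  # temporal visibility window
+    return center_t, opacity_scale
+</code></pre>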
+
+ comment: Project page: https://fudan-zvg.github.io/PVG/ +
+
+
+
+
+ + ☆ FediOS: Decoupling Orthogonal Subspaces for Personalization in + Feature-skew Federated Learning + + +
+ Personalized federated learning (pFL) enables collaborative training among +multiple clients to enhance the capability of customized local models. In pFL, +clients may have heterogeneous (also known as non-IID) data, which poses a key +challenge: how to decouple the data knowledge into generic knowledge for +global sharing and personalized knowledge for preserving local personalization. +A typical line of pFL work focuses on label distribution skew and adopts a +decoupling scheme where the model is split into a common feature extractor and +two prediction heads (generic and personalized). However, such a decoupling +scheme cannot solve the essential problem of feature skew heterogeneity, +because a common feature extractor cannot decouple the generic and personalized +features. Therefore, in this paper, we rethink the architecture decoupling +design for feature-skew pFL and propose an effective pFL method called FediOS. +In FediOS, we reformulate the decoupling into two feature extractors (generic +and personalized) and one shared prediction head. Orthogonal projections are +used for clients to map the generic features into one common subspace and +scatter the personalized features into different subspaces to achieve +decoupling. In addition, a shared prediction head is trained to +balance the importance of generic and personalized features during inference. +Extensive experiments on four vision datasets demonstrate that our method reaches +state-of-the-art pFL performance under feature-skew heterogeneity. + +
+
+
+
+
+ + ☆ Heterogeneous Graph-based Trajectory Prediction using Local Map Context + and Social Interactions SC 2023 + + +
+ Precisely predicting the future trajectories of surrounding traffic +participants is a crucial but challenging problem in autonomous driving, due to +complex interactions between traffic agents, map context and traffic rules. +Vector-based approaches have recently shown to achieve among the best +performances on trajectory prediction benchmarks. These methods model simple +interactions between traffic agents but don't distinguish between relation-type +and attributes like their distance along the road. Furthermore, they represent +lanes only by sequences of vectors representing center lines and ignore context +information like lane dividers and other road elements. We present a novel +approach for vector-based trajectory prediction that addresses these +shortcomings by leveraging three crucial sources of information: First, we +model interactions between traffic agents by a semantic scene graph, that +accounts for the nature and important features of their relation. Second, we +extract agent-centric image-based map features to model the local map context. +Finally, we generate anchor paths to enforce the policy in multi-modal +prediction to permitted trajectories only. Each of these three enhancements +shows advantages over the baseline model HoliGraph. + +
+
+ comment: Accepted on IEEE ITSC 2023 +
+
+
+
+
+ + ☆ Match me if you can: Semantic Correspondence Learning with Unpaired + Images + + +
+ Recent approaches for semantic correspondence have focused on obtaining +high-quality correspondences using a complicated network, refining the +ambiguous or noisy matching points. Despite their performance improvements, +they remain constrained by the limited training pairs due to costly point-level +annotations. This paper proposes a simple yet effective method that performs +training with unlabeled pairs to complement both limited image pairs and sparse +point pairs, requiring neither extra labeled keypoints nor trainable modules. +We fundamentally extend the data quantity and variety by augmenting new +unannotated pairs not primitively provided as training pairs in benchmarks. +Using a simple teacher-student framework, we offer reliable pseudo +correspondences to the student network via machine supervision. Finally, the +performance of our network is steadily improved by the proposed iterative +training, putting back the student as a teacher to generate refined labels and +train a new student repeatedly. Our models outperform the milestone baselines, +including state-of-the-art methods on semantic correspondence benchmarks. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ MaXTron: Mask Transformer with Trajectory Attention for Video Panoptic + Segmentation + + +
+ Video panoptic segmentation requires consistently segmenting (for both +`thing' and `stuff' classes) and tracking objects in a video over time. In this +work, we present MaXTron, a general framework that exploits Mask XFormer with +Trajectory Attention to tackle the task. MaXTron enriches an off-the-shelf mask +transformer by leveraging trajectory attention. The deployed mask transformer +takes as input a short clip consisting of only a few frames and predicts the +clip-level segmentation. To enhance the temporal consistency, MaXTron employs +within-clip and cross-clip tracking modules, efficiently utilizing trajectory +attention. Originally designed for video classification, trajectory attention +learns to model the temporal correspondences between neighboring frames and +aggregates information along the estimated motion paths. However, it is +nontrivial to directly extend trajectory attention to the per-pixel dense +prediction tasks due to its quadratic dependency on input size. To alleviate +the issue, we propose to adapt the trajectory attention for both the dense +pixel features and object queries, aiming to improve the short-term and +long-term tracking results, respectively. Particularly, in our within-clip +tracking module, we propose axial-trajectory attention that effectively +computes the trajectory attention for tracking dense pixels sequentially along +the height- and width-axes. The axial decomposition significantly reduces the +computational complexity for dense pixel features. In our cross-clip tracking +module, since the object queries in mask transformer are learned to encode the +object information, we are able to capture the long-term temporal connections +by applying trajectory attention to object queries, which learns to track each +object across different clips. Without bells and whistles, MaXTron demonstrates +state-of-the-art performances on video segmentation benchmarks. + +
+
+ comment: Code at https://github.com/TACJu/MaXTron +
+
+
+
+
+ + ☆ Dataset Distillation via the Wasserstein Metric + + +
+ Dataset distillation (DD) offers a compelling approach in computer vision, +with the goal of condensing extensive datasets into smaller synthetic versions +without sacrificing much of the model performance. In this paper, we continue +to study the methods for DD, by addressing its conceptually core objective: how +to capture the essential representation of extensive datasets in smaller, +synthetic forms. + We propose a novel approach utilizing the Wasserstein distance, a metric +rooted in optimal transport theory, to enhance distribution matching in DD. Our +method leverages the Wasserstein barycenter, offering a geometrically +meaningful way to quantify distribution differences and effectively capture the +centroid of a set of distributions. Our approach retains the computational +benefits of distribution matching-based methods while achieving new +state-of-the-art performance on several benchmarks. + To provide useful prior for learning the images, we embed the synthetic data +into the feature space of pretrained classification models to conduct +distribution matching. Extensive testing on various high-resolution datasets +confirms the effectiveness and adaptability of our method, indicating the +promising yet unexplored capabilities of Wasserstein metrics in dataset +distillation. + +
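+ As a lightweight stand-in for the barycenter-based matching described above, a
+sliced Wasserstein distance between synthetic and real features (taken from a
+pretrained backbone) already yields a differentiable distribution-matching
+loss; this is an illustration rather than the paper's method:
+<pre><code>
+import torch
+
+def sliced_wasserstein(feat_syn, feat_real, n_proj=64, n_quantiles=128):
+    # feat_syn: (Ns, D), feat_real: (Nr, D)
+    d = feat_syn.shape[1]
+    proj = torch.randn(d, n_proj, device=feat_syn.device)
+    proj = proj / proj.norm(dim=0, keepdim=True)               # random unit directions
+    qs = torch.linspace(0.0, 1.0, n_quantiles, device=feat_syn.device)
+    q_syn = torch.quantile(feat_syn @ proj, qs, dim=0)         # (n_quantiles, n_proj)
+    q_real = torch.quantile(feat_real @ proj, qs, dim=0)
+    return ((q_syn - q_real) ** 2).mean()                      # squared sliced-W2 estimate
+</code></pre>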
+
+ comment: 8 pages, 10 figures +
+
+
+
+
+ + ☆ Color-Emotion Associations in Art: Fuzzy Approach + + +
+ Art objects can evoke certain emotions. Color is a fundamental element of +visual art and plays a significant role in how art is perceived. This paper +introduces a novel approach to classifying emotions in art using Fuzzy Sets. We +employ a fuzzy approach because it aligns well with human judgments' imprecise +and subjective nature. Extensive fuzzy colors (n=120) and a broad emotional +spectrum (n=10) allow for a more human-consistent and context-aware exploration +of emotions inherent in paintings. First, we introduce the fuzzy color +representation model. Then, at the fuzzification stage, we process the Wiki Art +Dataset of paintings tagged with emotions, extracting fuzzy dominant colors +linked to specific emotions. This results in fuzzy color distributions for ten +emotions. Finally, we convert them back to a crisp domain, obtaining a +knowledge base of color-emotion associations in primary colors. Our findings +reveal strong associations between specific emotions and colors; for instance, +gratitude strongly correlates with green, brown, and orange. Other noteworthy +associations include brown and anger, orange with shame, yellow with happiness, +and gray with fear. Using these associations and Jaccard similarity, we can +find the emotions in the arbitrary untagged image. We conducted a 2AFC +experiment involving human subjects to evaluate the proposed method. The +average hit rate of 0.77 indicates a significant correlation between the +method's predictions and human perception. The proposed method is simple to +adapt to art painting retrieval systems. The study contributes to the +theoretical understanding of color-emotion associations in art, offering +valuable insights for various practical applications besides art, like +marketing, design, and psychology. + +
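+ Matching an untagged image against the derived knowledge base reduces to a
+fuzzy Jaccard similarity between colour-membership distributions; a minimal
+sketch where the dictionary format and profile names are assumptions:
+<pre><code>
+def fuzzy_jaccard(dist_a, dist_b):
+    # dist_*: {color_name: membership in [0, 1]}
+    colors = set(dist_a) | set(dist_b)
+    inter = sum(min(dist_a.get(c, 0.0), dist_b.get(c, 0.0)) for c in colors)
+    union = sum(max(dist_a.get(c, 0.0), dist_b.get(c, 0.0)) for c in colors)
+    return inter / union if union > 0 else 0.0
+
+def rank_emotions(image_colors, emotion_profiles):
+    # emotion_profiles: {emotion: color distribution from the knowledge base}
+    scores = {e: fuzzy_jaccard(image_colors, p) for e, p in emotion_profiles.items()}
+    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
+
+# e.g. rank_emotions({"green": 0.6, "brown": 0.3, "orange": 0.1}, profiles)
+</code></pre>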
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ Revisiting Proposal-based Object Detection + + +
+ This paper revisits the pipeline for detecting objects in images with +proposals. For any object detector, the obtained box proposals or queries need +to be classified and regressed towards ground truth boxes. The common solution +for the final predictions is to directly maximize the overlap between each +proposal and the ground truth box, followed by a winner-takes-all ranking or +non-maximum suppression. In this work, we propose a simple yet effective +alternative. For proposal regression, we solve a simpler problem where we +regress to the area of intersection between proposal and ground truth. In this +way, each proposal only specifies which part contains the object, avoiding a +blind inpainting problem where proposals need to be regressed beyond their +visual scope. In turn, we replace the winner-takes-all strategy and obtain the +final prediction by taking the union over the regressed intersections of a +proposal group surrounding an object. Our revisited approach comes with minimal +changes to the detection pipeline and can be plugged into any existing method. +We show that our approach directly improves canonical object detection and +instance segmentation architectures, highlighting the utility of +intersection-based regression and grouping. + +
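+ The two changes to the pipeline, regressing to the proposal/ground-truth
+intersection and replacing winner-takes-all with a union over a proposal group,
+can be written compactly for axis-aligned (x1, y1, x2, y2) boxes; this is an
+illustration, not the authors' code:
+<pre><code>
+import torch
+
+def intersection_target(proposal, gt):
+    # regression target: the intersection box instead of the full ground-truth box
+    x1 = torch.maximum(proposal[..., 0], gt[..., 0])
+    y1 = torch.maximum(proposal[..., 1], gt[..., 1])
+    x2 = torch.minimum(proposal[..., 2], gt[..., 2])
+    y2 = torch.minimum(proposal[..., 3], gt[..., 3])
+    return torch.stack([x1, y1, x2, y2], dim=-1)
+
+def union_of_intersections(regressed):
+    # final prediction: enclosing box of the regressed intersections of one proposal group
+    return torch.stack([regressed[:, 0].min(), regressed[:, 1].min(),
+                        regressed[:, 2].max(), regressed[:, 3].max()])
+</code></pre>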
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ☆ DifAugGAN: A Practical Diffusion-style Data Augmentation for GAN-based + Single Image Super-resolution + + +
+ It is well known that the adversarial optimization of GAN-based image +super-resolution (SR) methods makes the resulting SR model generate unpleasant +and undesirable artifacts, leading to large distortion. We attribute the cause +of such distortions to the poor calibration of the discriminator, which hampers +its ability to provide meaningful feedback to the generator for learning +high-quality images. To address this problem, we propose a simple but +non-trivial diffusion-style data augmentation scheme for current GAN-based SR +methods, known as DifAugGAN. It involves adapting the diffusion process in +generative diffusion models to improve the calibration of the discriminator +during training, motivated by the success of data augmentation schemes in +achieving good calibration. Our DifAugGAN can be a Plug-and-Play +strategy for current GAN-based SISR methods to improve the calibration of the +discriminator and thus improve SR performance. Extensive experimental +evaluations demonstrate the superiority of DifAugGAN over state-of-the-art +GAN-based SISR methods across both synthetic and real-world datasets, +showcasing notable advancements in both qualitative and quantitative results. + +
+
+
+
+
+ + ☆ Accurate Segmentation of Optic Disc And Cup from Multiple Pseudo-labels + by Noise-Aware Learning SC + + +
+ Optic disc and cup segmentation play a crucial role in automating the +screening and diagnosis of optic glaucoma. While data-driven convolutional +neural networks (CNNs) show promise in this area, the inherent ambiguity of +segmenting object and background boundaries in the task of optic disc and cup +segmentation leads to noisy annotations that impact model performance. To +address this, we propose an innovative label-denoising method of Multiple +Pseudo-labels Noise-aware Network (MPNN) for accurate optic disc and cup +segmentation. Specifically, the Multiple Pseudo-labels Generation and Guided +Denoising (MPGGD) module generates pseudo-labels by multiple different +initialization networks trained on true labels, and the pixel-level consensus +information extracted from these pseudo-labels guides to differentiate clean +pixels from noisy pixels. The training framework of the MPNN is constructed by +a teacher-student architecture to learn segmentation from clean pixels and +noisy pixels. Particularly, such a framework adeptly leverages (i) reliable and +fundamental insights from clean pixels and (ii) the supplementary knowledge +within noisy pixels via multiple perturbation-based unsupervised consistency. +Compared to other label-denoising methods, comprehensive experimental results +on the RIGA dataset demonstrate our method's excellent performance and +significant denoising ability. + +
+
+ comment: to CSCWD 2024 +
+
+
+
+
+ + ☆ Improving Adversarial Transferability via Model Alignment + + +
+ Neural networks are susceptible to adversarial perturbations that are +transferable across different models. In this paper, we introduce a novel model +alignment technique aimed at improving a given source model's ability to +generate transferable adversarial perturbations. During the alignment +process, the parameters of the source model are fine-tuned to minimize an +alignment loss. This loss measures the divergence in the predictions between +the source model and another, independently trained model, referred to as the +witness model. To understand the effect of model alignment, we conduct a +geometric analysis of the resulting changes in the loss landscape. Extensive +experiments on the ImageNet dataset, using a variety of model architectures, +demonstrate that perturbations generated from aligned source models exhibit +significantly higher transferability than those from the original source model. + +
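+ A minimal sketch of one alignment step, fine-tuning the source model to match
+a frozen witness model's predictive distribution via a KL divergence (the
+paper's alignment loss may use a different divergence or feature-level terms):
+<pre><code>
+import torch
+import torch.nn.functional as F
+
+def alignment_step(source, witness, images, optimizer):
+    witness.eval()
+    with torch.no_grad():
+        target = F.softmax(witness(images), dim=1)             # witness predictions (frozen)
+    log_pred = F.log_softmax(source(images), dim=1)
+    loss = F.kl_div(log_pred, target, reduction="batchmean")   # divergence between the two models
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+    return loss.item()
+</code></pre>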
+
+
+
+
+ + ☆ PRS: Sharp Feature Priors for Resolution-Free Surface Remeshing + + +
+ Surface reconstruction with preservation of geometric features is a +challenging computer vision task. Despite significant progress in implicit +shape reconstruction, state-of-the-art mesh extraction methods often produce +aliased, perceptually distorted surfaces and lack scalability to +high-resolution 3D shapes. We present a data-driven approach for automatic +feature detection and remeshing that requires only a coarse, aliased mesh as +input and scales to arbitrary resolution reconstructions. We define and learn a +collection of surface-based fields to (1) capture sharp geometric features in +the shape with an implicit vertexwise model and (2) approximate improvements in +normals alignment obtained by applying edge-flips with an edgewise model. To +support scaling to arbitrary complexity shapes, we learn our fields using local +triangulated patches, fusing estimates on complete surface meshes. Our feature +remeshing algorithm integrates the learned fields as sharp feature priors and +optimizes vertex placement and mesh connectivity for maximum expected surface +improvement. On a challenging collection of high-resolution shape +reconstructions in the ABC dataset, our algorithm improves over +state-of-the-art by 26% normals F-score and 42% perceptual +$\text{RMSE}_{\text{v}}$. + +
+
+
+
+
+ + ☆ ZeST-NeRF: Using temporal aggregation for Zero-Shot Temporal NeRFs BMVC 2023 + + +
+ In the field of media production, video editing techniques play a pivotal +role. Recent approaches have had great success at performing novel view image +synthesis of static scenes. But adding temporal information adds an extra layer +of complexity. Previous models have focused on implicitly representing static +and dynamic scenes using NeRF. These models achieve impressive results but are +costly at training and inference time. They overfit an MLP to describe the +scene implicitly as a function of position. This paper proposes ZeST-NeRF, a +new approach that can produce temporal NeRFs for new scenes without retraining. +We can accurately reconstruct novel views using multi-view synthesis techniques +and scene flow-field estimation, trained only with unrelated scenes. We +demonstrate how existing state-of-the-art approaches from a range of fields +cannot adequately solve this new task and demonstrate the efficacy of our +solution. The resulting network improves quantitatively by 15% and produces +significantly better visual results. + +
+
+ comment: VUA BMVC 2023 +
+
+
+
+
+ + ☆ Language Embedded 3D Gaussians for Open-Vocabulary Scene Understanding + + +
+ Open-vocabulary querying in 3D space is challenging but essential for scene +understanding tasks such as object localization and segmentation. +Language-embedded scene representations have made progress by incorporating +language features into 3D spaces. However, their efficacy heavily depends on +neural networks that are resource-intensive in training and rendering. Although +recent 3D Gaussians offer efficient and high-quality novel view synthesis, +directly embedding language features in them leads to prohibitive memory usage +and decreased performance. In this work, we introduce Language Embedded 3D +Gaussians, a novel scene representation for open-vocabulary query tasks. +Instead of embedding high-dimensional raw semantic features on 3D Gaussians, we +propose a dedicated quantization scheme that drastically alleviates the memory +requirement, and a novel embedding procedure that achieves smoother yet high +accuracy query, countering the multi-view feature inconsistencies and the +high-frequency inductive bias in point-based representations. Our comprehensive +experiments show that our representation achieves the best visual quality and +language querying accuracy across current language-embedded representations, +while maintaining real-time rendering frame rates on a single desktop GPU. + +
+
+
+
+
+ + ☆ ESG Accountability Made Easy: DocQA at Your Service AAAI + + +
+ We present Deep Search DocQA. This application enables information extraction +from documents via a question-answering conversational assistant. The system +integrates several technologies from different AI disciplines consisting of +document conversion to machine-readable format (via computer vision), finding +relevant data (via natural language processing), and formulating an eloquent +response (via large language models). Users can explore over 10,000 +Environmental, Social, and Governance (ESG) disclosure reports from over 2000 +corporations. The Deep Search platform can be accessed at: +https://ds4sd.github.io. + +
+
+ comment: Accepted at the Demonstration Track of the 38th Annual AAAI + Conference on Artificial Intelligence (AAAI 24) +
+
+
+
+
+ + ☆ HOLD: Category-agnostic 3D Reconstruction of Interacting Hands and + Objects from Video + + +
+ Since humans interact with diverse objects every day, the holistic 3D capture +of these interactions is important to understand and model human behaviour. +However, most existing methods for hand-object reconstruction from RGB either +assume pre-scanned object templates or heavily rely on limited 3D hand-object +data, restricting their ability to scale and generalize to more unconstrained +interaction settings. To this end, we introduce HOLD -- the first +category-agnostic method that reconstructs an articulated hand and object +jointly from a monocular interaction video. We develop a compositional +articulated implicit model that can reconstruct disentangled 3D hand and object +from 2D images. We also further incorporate hand-object constraints to improve +hand-object poses and consequently the reconstruction quality. Our method does +not rely on 3D hand-object annotations while outperforming fully-supervised +baselines in both in-the-lab and challenging in-the-wild settings. Moreover, we +qualitatively show its robustness in reconstructing from in-the-wild videos. +Code: https://github.com/zc-alexfan/hold + +
+
+
+
+
+ + ☆ VTimeLLM: Empower LLM to Grasp Video Moments + + +
+ Large language models (LLMs) have shown remarkable text understanding
+capabilities, which have been extended as Video LLMs to handle video data for
+comprehending visual details. However, existing Video LLMs can only provide a
+coarse description of the entire video, failing to capture the precise start
+and end time boundaries of specific events. In this paper, we address this
+issue by proposing VTimeLLM, a novel Video LLM designed for fine-grained video
+moment understanding and reasoning with respect to time boundaries.
+Specifically, our VTimeLLM adopts a boundary-aware three-stage training
+strategy, which respectively utilizes image-text pairs for feature alignment,
+multiple-event videos to increase temporal-boundary awareness, and high-quality
+video-instruction tuning to further improve temporal understanding and
+alignment with human intent. Extensive experiments demonstrate that in
+fine-grained time-related comprehension tasks for videos, such as Temporal
+Video Grounding and Dense Video Captioning, VTimeLLM significantly outperforms
+existing Video LLMs. Moreover, its fine-grained temporal understanding of
+videos enables VTimeLLM to beat existing Video LLMs on a video dialogue
+benchmark, showing its superior cross-modal understanding and reasoning
+abilities.
+
+
+
+
+
+
+ + ☆ Layered Rendering Diffusion Model for Zero-Shot Guided Image Synthesis + + +
+ This paper introduces innovative solutions to enhance spatial controllability +in diffusion models reliant on text queries. We present two key innovations: +Vision Guidance and the Layered Rendering Diffusion (LRDiff) framework. Vision +Guidance, a spatial layout condition, acts as a clue in the perturbed +distribution, greatly narrowing down the search space, to focus on the image +sampling process adhering to the spatial layout condition. The LRDiff framework +constructs an image-rendering process with multiple layers, each of which +applies the vision guidance to instructively estimate the denoising direction +for a single object. Such a layered rendering strategy effectively prevents +issues like unintended conceptual blending or mismatches, while allowing for +more coherent and contextually accurate image synthesis. The proposed method +provides a more efficient and accurate means of synthesising images that align +with specific spatial and contextual requirements. We demonstrate through our +experiments that our method provides better results than existing techniques +both quantitatively and qualitatively. We apply our method to three practical +applications: bounding box-to-image, semantic mask-to-image and image editing. + +
+
+
+
+
+ + ☆ E2PNet: Event to Point Cloud Registration with Spatio-Temporal + Representation Learning NeurIPS 2023 + + +
+ Event cameras have emerged as a promising vision sensor in recent years due
+to their unparalleled temporal resolution and dynamic range. While registration
+of 2D RGB images to 3D point clouds is a long-standing problem in computer
+vision, no prior work studies 2D-3D registration for event cameras. To this
+end, we propose E2PNet, the first learning-based method for event-to-point
+cloud registration. The core of E2PNet is a novel feature representation
+network called Event-Points-to-Tensor (EP2T), which encodes event data into a
+2D grid-shaped feature tensor. This grid-shaped feature enables mature
+RGB-based frameworks to be easily used for event-to-point cloud registration,
+without changing hyper-parameters and the training procedure. EP2T treats the
+event input as spatio-temporal point clouds. Unlike standard 3D learning
+architectures that treat all dimensions of point clouds equally, the novel
+sampling and information aggregation modules in EP2T are designed to handle the
+inhomogeneity of the spatial and temporal dimensions. Experiments on the MVSEC
+and VECtor datasets demonstrate the superiority of E2PNet over hand-crafted and
+other learning-based methods. Compared to RGB-based registration, E2PNet is
+more robust to extreme illumination or fast motion due to the use of event
+data. Beyond 2D-3D registration, we also show the potential of EP2T for other
+vision tasks such as flow estimation, event-to-image reconstruction and object
+recognition. The source code can be found at:
+https://github.com/Xmu-qcj/E2PNet.
+
+
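+
+ For intuition, converting a raw event stream into a grid-shaped tensor can be
+sketched as below. This is a generic voxel-grid accumulation written for
+illustration only; the function name, bin count, and layout are assumptions and
+not the paper's learned EP2T module.
+
+```python
+import numpy as np
+
+def events_to_grid(events, height, width, t_bins):
+    """Accumulate events (N, 4) of (x, y, t, polarity) into a (t_bins, H, W) grid."""
+    grid = np.zeros((t_bins, height, width), dtype=np.float32)
+    t = events[:, 2]
+    t_norm = (t - t.min()) / max(float(t.max() - t.min()), 1e-9)  # time in [0, 1]
+    t_idx = np.clip((t_norm * t_bins).astype(int), 0, t_bins - 1)
+    x = events[:, 0].astype(int)
+    y = events[:, 1].astype(int)
+    pol = np.where(events[:, 3] > 0, 1.0, -1.0)   # signed polarity
+    np.add.at(grid, (t_idx, y, x), pol)           # scatter-add each event
+    return grid
+```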
+
+ comment: 10 pages, 4 figures, accepted by Thirty-seventh Conference on Neural + Information Processing Systems(NeurIPS 2023) +
+
+
+
+
+ + ☆ TeG-DG: Textually Guided Domain Generalization for Face Anti-Spoofing + + +
+ Enhancing the domain generalization performance of Face Anti-Spoofing (FAS) +techniques has emerged as a research focus. Existing methods are dedicated to +extracting domain-invariant features from various training domains. Despite the +promising performance, the extracted features inevitably contain residual style +feature bias (e.g., illumination, capture device), resulting in inferior +generalization performance. In this paper, we propose an alternative and +effective solution, the Textually Guided Domain Generalization (TeG-DG) +framework, which can effectively leverage text information for cross-domain +alignment. Our core insight is that text, as a more abstract and universal form +of expression, can capture the commonalities and essential characteristics +across various attacks, bridging the gap between different image domains. +Contrary to existing vision-language models, the proposed framework is +elaborately designed to enhance the domain generalization ability of the FAS +task. Concretely, we first design a Hierarchical Attention Fusion (HAF) module +to enable adaptive aggregation of visual features at different levels; Then, a +Textual-Enhanced Visual Discriminator (TEVD) is proposed for not only better +alignment between the two modalities but also to regularize the classifier with +unbiased text features. TeG-DG significantly outperforms previous approaches, +especially in situations with extremely limited source domain data (~14% and +~12% improvements on HTER and AUC respectively), showcasing impressive few-shot +performance. + +
+
+
+
+
+ + ☆ CAT-DM: Controllable Accelerated Virtual Try-on with Diffusion Model + + +
+ Image-based virtual try-on enables users to virtually try on different
+garments by altering original clothes in their photographs. Generative
+Adversarial Networks (GANs) dominate the research field in image-based virtual
+try-on, but have not resolved problems such as unnatural garment deformation
+and blurry generation quality. Recently, diffusion models have emerged with
+surprising performance across various image generation tasks. While the
+generative quality of diffusion models is impressive, achieving controllability
+poses a significant challenge when applying them to virtual try-on, and their
+multiple denoising iterations limit their potential for real-time applications.
+In this paper, we propose Controllable Accelerated virtual Try-on with
+Diffusion Model (CAT-DM). To enhance controllability, a basic diffusion-based
+virtual try-on network is designed, which utilizes ControlNet to introduce
+additional control conditions and improves the feature extraction of garment
+images. In terms of acceleration, CAT-DM initiates the reverse denoising
+process with an implicit distribution generated by a pre-trained GAN-based
+model. Compared with previous try-on methods based on diffusion models, CAT-DM
+not only retains the pattern and texture details of the in-shop garment but
+also reduces the sampling steps without compromising generation quality.
+Extensive experiments demonstrate the superiority of CAT-DM against both
+GAN-based and diffusion-based methods in producing more realistic images and
+accurately reproducing garment patterns. Our code and models will be publicly
+released.
+
+
+
+
+
+
+ + ☆ Corrupting Convolution-based Unlearnable Datasets with Pixel-based Image + Transformations + + +
+ Unlearnable datasets (UDs) lead to a drastic drop in the generalization
+performance of models trained on them by introducing elaborate and
+imperceptible perturbations into clean training sets. Many existing defenses,
+e.g., JPEG compression and adversarial training, effectively counter UDs based
+on norm-constrained additive noise. However, a new type of convolution-based UD
+has recently been proposed that renders all existing defenses ineffective,
+presenting a greater challenge to defenders. To address this, we express the
+convolution-based unlearnable sample as the result of multiplying a matrix by a
+clean sample in a simplified scenario, and formalize the intra-class matrix
+inconsistency as $\Theta_{imi}$ and the inter-class matrix consistency as
+$\Theta_{imc}$ to investigate the working mechanism of convolution-based UDs.
+We conjecture that increasing both of these metrics will mitigate the
+unlearnability effect. Validation experiments support this hypothesis, and we
+further design a random matrix that boosts both $\Theta_{imi}$ and
+$\Theta_{imc}$, achieving a notable defense effect. Building upon and extending
+these findings, we propose a new image COrruption that employs a random
+multiplicative transformation via an INterpolation operation to successfully
+defend against convolution-based UDs. Our approach leverages global pixel-wise
+random interpolation, effectively suppressing the impact of multiplicative
+noise in convolution-based UDs. Additionally, we design two new forms of
+convolution-based UDs and find that our defense remains the most effective
+against them.
+
+
+
+
+
+
+ + ☆ MV-CLIP: Multi-View CLIP for Zero-shot 3D Shape Recognition + + +
+ Large-scale pre-trained models have demonstrated impressive performance in +vision and language tasks within open-world scenarios. Due to the lack of +comparable pre-trained models for 3D shapes, recent methods utilize +language-image pre-training to realize zero-shot 3D shape recognition. However, +due to the modality gap, pretrained language-image models are not confident +enough in the generalization to 3D shape recognition. Consequently, this paper +aims to improve the confidence with view selection and hierarchical prompts. +Leveraging the CLIP model as an example, we employ view selection on the vision +side by identifying views with high prediction confidence from multiple +rendered views of a 3D shape. On the textual side, the strategy of hierarchical +prompts is proposed for the first time. The first layer prompts several +classification candidates with traditional class-level descriptions, while the +second layer refines the prediction based on function-level descriptions or +further distinctions between the candidates. Remarkably, without the need for +additional training, our proposed method achieves impressive zero-shot 3D +classification accuracies of 84.44\%, 91.51\%, and 66.17\% on ModelNet40, +ModelNet10, and ShapeNet Core55, respectively. Furthermore, we will make the +code publicly available to facilitate reproducibility and further research in +this area. + +
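+
+ A rough sketch of the view-selection step, assuming a CLIP-style encoder pair
+with an encode_image method and pre-computed, normalized text features; the
+interface, the confidence measure, and the top-k value are assumptions rather
+than the authors' released code.
+
+```python
+import torch
+
+@torch.no_grad()
+def fuse_confident_views(clip_model, view_images, text_features, top_k=4):
+    """Keep the rendered views with the most confident zero-shot prediction
+    and average their logits over classes."""
+    img_feats = clip_model.encode_image(view_images)            # (V, D)
+    img_feats = img_feats / img_feats.norm(dim=-1, keepdim=True)
+    logits = 100.0 * img_feats @ text_features.T                # (V, C)
+    confidence = logits.softmax(dim=-1).max(dim=-1).values      # per-view confidence
+    keep = confidence.topk(min(top_k, confidence.numel())).indices
+    return logits[keep].mean(dim=0)                             # fused class logits
+```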
+
+
+
+
+ + ☆ RainAI -- Precipitation Nowcasting from Satellite Data + + +
+ This paper presents a solution to the Weather4Cast 2023 competition, where +the goal is to forecast high-resolution precipitation with an 8-hour lead time +using lower-resolution satellite radiance images. We propose a simple, yet +effective method for spatiotemporal feature learning using a 2D U-Net model, +that outperforms the official 3D U-Net baseline in both performance and +efficiency. We place emphasis on refining the dataset, through importance +sampling and dataset preparation, and show that such techniques have a +significant impact on performance. We further study an alternative +cross-entropy loss function that improves performance over the standard mean +squared error loss, while also enabling models to produce probabilistic +outputs. Additional techniques are explored regarding the generation of +predictions at different lead times, specifically through Conditioning Lead +Time. Lastly, to generate high-resolution forecasts, we evaluate standard and +learned upsampling methods. The code and trained parameters are available at +https://github.com/rafapablos/w4c23-rainai. + +
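+
+ The cross-entropy alternative to a mean-squared-error loss can be sketched by
+discretizing rain rates into bins; the bin edges below are hypothetical and not
+the competition's actual thresholds.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def binned_rain_loss(logits, target_mm, bin_edges):
+    """Cross-entropy over discretized rain rates.
+
+    logits:    (B, n_bins, H, W) model outputs
+    target_mm: (B, H, W) ground-truth rain rate
+    bin_edges: 1-D tensor of n_bins - 1 increasing thresholds
+    """
+    target_bin = torch.bucketize(target_mm, bin_edges)  # integer bin per pixel
+    return F.cross_entropy(logits, target_bin)
+
+# Example (hypothetical) discretization in mm/h:
+edges = torch.tensor([0.2, 1.0, 5.0, 10.0])
+```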
+
+
+
+
+ + ☆ On Exact Inversion of DPM-Solvers + + +
+ Diffusion probabilistic models (DPMs) are a key component in modern
+generative models. DPM-solvers have significantly reduced latency and enhanced
+quality, but make it challenging to find the exact inverse (i.e., the initial
+noise from a given image). Here we investigate exact inversion for DPM-solvers
+and propose algorithms to perform it when samples are generated by first-order
+as well as higher-order DPM-solvers. For each explicit denoising step in
+DPM-solvers, we formulate the inversion using implicit methods such as gradient
+descent or the forward-step method to ensure robustness to large
+classifier-free guidance, unlike the prior approach based on fixed-point
+iteration. Experimental results demonstrate that our proposed exact inversion
+methods significantly reduce the error of both image and noise reconstruction,
+greatly enhance the ability to distinguish invisible watermarks, and
+consistently prevent unintended background changes during image editing.
+Project page: \url{https://smhongok.github.io/inv-dpm.html}.
+
+
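+
+ The gradient-based alternative to fixed-point iteration can be sketched as
+inverting a single solver update by minimizing a reconstruction residual;
+`step_fn` is an assumed differentiable wrapper around one DPM-solver step, not
+the authors' implementation.
+
+```python
+import torch
+
+def invert_step_by_gradient(step_fn, x_next, x_init, iters=50, lr=0.1):
+    """Find x such that step_fn(x) ~= x_next via gradient descent."""
+    x = x_init.clone().requires_grad_(True)
+    optimizer = torch.optim.SGD([x], lr=lr)
+    for _ in range(iters):
+        optimizer.zero_grad()
+        loss = ((step_fn(x) - x_next) ** 2).mean()   # residual of one solver step
+        loss.backward()
+        optimizer.step()
+    return x.detach()
+```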
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ A Survey on Deep Learning for Polyp Segmentation: Techniques, Challenges + and Future Trends + + +
+ Early detection and assessment of polyps play a crucial role in the +prevention and treatment of colorectal cancer (CRC). Polyp segmentation +provides an effective solution to assist clinicians in accurately locating and +segmenting polyp regions. In the past, people often relied on manually +extracted lower-level features such as color, texture, and shape, which often +had issues capturing global context and lacked robustness to complex scenarios. +With the advent of deep learning, more and more outstanding medical image +segmentation algorithms based on deep learning networks have emerged, making +significant progress in this field. This paper provides a comprehensive review +of polyp segmentation algorithms. We first review some traditional algorithms +based on manually extracted features and deep segmentation algorithms, then +detail benchmark datasets related to the topic. Specifically, we carry out a +comprehensive evaluation of recent deep learning models and results based on +polyp sizes, considering the pain points of research topics and differences in +network structures. Finally, we discuss the challenges of polyp segmentation +and future trends in this field. The models, benchmark datasets, and source +code links we collected are all published at +https://github.com/taozh2017/Awesome-Polyp-Segmentation. + +
+
+ comment: 15 pages, 7 figures, and +
+
+
+
+
+ + ☆ Each Test Image Deserves A Specific Prompt: Continual Test-Time + Adaptation for 2D Medical Image Segmentation + + +
+ Distribution shift widely exists in medical images acquired from different +medical centres and poses a significant obstacle to deploying the pre-trained +semantic segmentation model in real-world applications. Test-time adaptation +has proven its effectiveness in tackling the cross-domain distribution shift +during inference. However, most existing methods achieve adaptation by updating +the pre-trained models, rendering them susceptible to error accumulation and +catastrophic forgetting when encountering a series of distribution shifts +(i.e., under the continual test-time adaptation setup). To overcome these +challenges caused by updating the models, in this paper, we freeze the +pre-trained model and propose the Visual Prompt-based Test-Time Adaptation +(VPTTA) method to train a specific prompt for each test image to align the +statistics in the batch normalization layers. Specifically, we present the +low-frequency prompt, which is lightweight with only a few parameters and can +be effectively trained in a single iteration. To enhance prompt initialization, +we equip VPTTA with a memory bank to benefit the current prompt from previous +ones. Additionally, we design a warm-up mechanism, which mixes source and +target statistics to construct warm-up statistics, thereby facilitating the +training process. Extensive experiments demonstrate the superiority of our +VPTTA over other state-of-the-art methods on two medical image segmentation +benchmark tasks. The code and weights of pre-trained source models are +available at https://github.com/Chen-Ziyang/VPTTA. + +
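+
+ The warm-up mechanism mixes source and target batch-normalization statistics;
+a minimal sketch under assumed shapes (statistics broadcastable to the feature
+map, e.g. (1, C, 1, 1)) and an arbitrary mixing weight:
+
+```python
+import torch
+
+def warmup_statistics(mu_src, var_src, mu_tgt, var_tgt, lam=0.7):
+    """Convex mix of stored source statistics and current target-batch statistics."""
+    mu = lam * mu_src + (1.0 - lam) * mu_tgt
+    var = lam * var_src + (1.0 - lam) * var_tgt
+    return mu, var
+
+def normalize_with(x, mu, var, weight, bias, eps=1e-5):
+    # Batch-norm style normalization using the mixed statistics.
+    return weight * (x - mu) / torch.sqrt(var + eps) + bias
+```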
+
+
+
+
+ + ☆ Automating lookahead planning using site appearance and space + utilization + + +
+ This study proposes a method to automate the development of lookahead +planning. The proposed method uses construction material conditions (i.e., +appearances) and site space utilization to predict task completion rates. A +Gated Recurrent Unit (GRU) based Recurrent Neural Network (RNN) model was +trained using a segment of a construction project timeline to estimate +completion rates of tasks and propose data-aware lookahead plans. The proposed +method was evaluated in a sample construction project involving finishing works +such as plastering, painting, and installing electrical fixtures. The results +show that the proposed method can assist with developing automated lookahead +plans. In doing so, this study links construction planning with actual events +at the construction site. It extends the traditional scheduling techniques and +integrates a broader spectrum of site spatial constraints into lookahead +planning. + +
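+
+ A minimal GRU-based regressor for per-task completion rates could look like
+the sketch below; the input feature dimension and hidden size are illustrative
+and not values taken from the study.
+
+```python
+import torch
+import torch.nn as nn
+
+class CompletionRateGRU(nn.Module):
+    """Map a sequence of site features (appearance and space-utilization
+    descriptors) to a completion rate in [0, 1]."""
+    def __init__(self, in_dim=32, hidden=64):
+        super().__init__()
+        self.gru = nn.GRU(in_dim, hidden, batch_first=True)
+        self.head = nn.Linear(hidden, 1)
+
+    def forward(self, x):                 # x: (batch, time, in_dim)
+        _, h = self.gru(x)                # h: (1, batch, hidden)
+        return torch.sigmoid(self.head(h[-1]))   # (batch, 1) completion rate
+```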
+
+
+
+
+ + ☆ TIDE: Test Time Few Shot Object Detection + + +
+ Few-shot object detection (FSOD) aims to extract semantic knowledge from
+limited object instances of novel categories within a target domain. Recent
+advances in FSOD focus on fine-tuning the base model on a few objects via
+meta-learning or data augmentation. Despite their success, the majority of them
+rely on parametric readjustment to generalize to novel objects, which faces
+considerable challenges in Industry 5.0, such as (i) a certain amount of
+fine-tuning time being required, and (ii) the parameters of the deployed model
+being unavailable due to privilege protection, making fine-tuning infeasible.
+Such constraints naturally limit their application in scenarios with real-time
+configuration requirements or within black-box settings. To tackle these
+challenges, we formalize a novel FSOD task, referred to as Test TIme Few Shot
+DEtection (TIDE), where the model is not tuned during the configuration
+procedure. To that end, we introduce an asymmetric architecture for learning a
+support-instance-guided dynamic category classifier. Furthermore, a
+cross-attention module and a multi-scale resizer are provided to enhance model
+performance. Experimental results on multiple few-shot object detection
+platforms reveal that the proposed TIDE significantly outperforms existing
+contemporary methods. The implementation code is available at
+https://github.com/deku-0621/TIDE
+
+
+
+
+
+
+ + ☆ DSeg: Direct Line Segments Detection + + +
+ This paper presents a model-driven approach to detect image line segments.
+The approach incrementally detects segments on the gradient image using a
+linear Kalman filter that estimates the supporting line parameters and their
+associated variances. The algorithm is fast and robust to image noise and
+illumination variations, allows the detection of longer line segments than
+data-driven approaches, and does not require any tedious parameter tuning. An
+extension of the algorithm that exploits a pyramidal approach to enhance the
+quality of results is also proposed. Results with varying scene illumination
+and comparisons to classic existing approaches are presented.
+
+
+
+
+
+
+ + ☆ Multilevel Saliency-Guided Self-Supervised Learning for Image Anomaly + Detection + + +
+ Anomaly detection (AD) is a fundamental task in computer vision. It aims to +identify incorrect image data patterns which deviate from the normal ones. +Conventional methods generally address AD by preparing augmented negative +samples to enforce self-supervised learning. However, these techniques +typically do not consider semantics during augmentation, leading to the +generation of unrealistic or invalid negative samples. Consequently, the +feature extraction network can be hindered from embedding critical features. In +this study, inspired by visual attention learning approaches, we propose +CutSwap, which leverages saliency guidance to incorporate semantic cues for +augmentation. Specifically, we first employ LayerCAM to extract multilevel +image features as saliency maps and then perform clustering to obtain multiple +centroids. To fully exploit saliency guidance, on each map, we select a pixel +pair from the cluster with the highest centroid saliency to form a patch pair. +Such a patch pair includes highly similar context information with dense +semantic correlations. The resulting negative sample is created by swapping the +locations of the patch pair. Compared to prior augmentation methods, CutSwap +generates more subtle yet realistic negative samples to facilitate quality +feature learning. Extensive experimental and ablative evaluations demonstrate +that our method achieves state-of-the-art AD performance on two mainstream AD +benchmark datasets. + +
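+
+ The core augmentation is a saliency-guided patch swap. A simplified sketch is
+given below: the patch centers are taken as inputs here (CutSwap picks them
+from the most salient LayerCAM cluster), and they are assumed to lie away from
+the image border with non-overlapping patches.
+
+```python
+import numpy as np
+
+def swap_patches(image, center_a, center_b, size=16):
+    """Create a negative sample by swapping two same-size square patches."""
+    out = image.copy()
+    half = size // 2
+    (ya, xa), (yb, xb) = center_a, center_b
+    patch_a = out[ya - half:ya + half, xa - half:xa + half].copy()
+    patch_b = out[yb - half:yb + half, xb - half:xb + half].copy()
+    out[ya - half:ya + half, xa - half:xa + half] = patch_b
+    out[yb - half:yb + half, xb - half:xb + half] = patch_a
+    return out
+```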
+
+
+
+
+ + ☆ MRFP: Learning Generalizable Semantic Segmentation from Sim-2-Real with + Multi-Resolution Feature Perturbation + + +
+ Deep neural networks have shown exemplary performance on semantic scene +understanding tasks on source domains, but due to the absence of style +diversity during training, enhancing performance on unseen target domains using +only single source domain data remains a challenging task. Generation of +simulated data is a feasible alternative to retrieving large style-diverse +real-world datasets as it is a cumbersome and budget-intensive process. +However, the large domain-specific inconsistencies between simulated and +real-world data pose a significant generalization challenge in semantic +segmentation. In this work, to alleviate this problem, we propose a novel +MultiResolution Feature Perturbation (MRFP) technique to randomize +domain-specific fine-grained features and perturb style of coarse features. Our +experimental results on various urban-scene segmentation datasets clearly +indicate that, along with the perturbation of style-information, perturbation +of fine-feature components is paramount to learn domain invariant robust +feature maps for semantic segmentation models. MRFP is a simple and +computationally efficient, transferable module with no additional learnable +parameters or objective functions, that helps state-of-the-art deep neural +networks to learn robust domain invariant features for simulation-to-real +semantic segmentation. + +
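+
+ Perturbing the style of coarse features typically amounts to randomizing their
+channel-wise statistics. The sketch below shows this common recipe; it is an
+illustration of style perturbation in general, not the exact MRFP module, and
+the noise scale is arbitrary.
+
+```python
+import torch
+
+def perturb_feature_style(feat, noise_std=0.1):
+    """Randomly rescale channel-wise mean/std of a (B, C, H, W) feature map."""
+    b, c = feat.shape[:2]
+    mu = feat.mean(dim=(2, 3), keepdim=True)
+    sigma = feat.std(dim=(2, 3), keepdim=True) + 1e-6
+    normalized = (feat - mu) / sigma
+    mu_new = mu * (1 + noise_std * torch.randn(b, c, 1, 1, device=feat.device))
+    sigma_new = sigma * (1 + noise_std * torch.randn(b, c, 1, 1, device=feat.device))
+    return normalized * sigma_new + mu_new
+```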
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ☆ Advances in 3D Neural Stylization: A Survey + + +
+ Modern artificial intelligence provides a novel way of producing digital art +in styles. The expressive power of neural networks enables the realm of visual +style transfer methods, which can be used to edit images, videos, and 3D data +to make them more artistic and diverse. This paper reports on recent advances +in neural stylization for 3D data. We provide a taxonomy for neural stylization +by considering several important design choices, including scene +representation, guidance data, optimization strategies, and output styles. +Building on such taxonomy, our survey first revisits the background of neural +stylization on 2D images, and then provides in-depth discussions on recent +neural stylization methods for 3D data, where we also provide a mini-benchmark +on artistic stylization methods. Based on the insights gained from the survey, +we then discuss open challenges, future research, and potential applications +and impacts of neural stylization. + +
+
+ comment: 26 pages +
+
+
+
+
+ + ☆ Anisotropic Neural Representation Learning for High-Quality Neural + Rendering + + +
+ Neural radiance fields (NeRFs) have achieved impressive view synthesis
+results by learning an implicit volumetric representation from multi-view
+images. To project the implicit representation into an image, NeRF employs
+volume rendering that approximates the continuous integrals of rays as an
+accumulation of the colors and densities of the sampled points. Although this
+approximation enables efficient rendering, it ignores the direction information
+in point intervals, resulting in ambiguous features and limited reconstruction
+quality. In this paper, we propose an anisotropic neural representation
+learning method that utilizes learnable view-dependent features to improve
+scene representation and reconstruction. We model the volumetric function as
+spherical harmonic (SH)-guided anisotropic features, parameterized by
+multilayer perceptrons, facilitating ambiguity elimination while preserving the
+rendering efficiency. To achieve robust scene reconstruction without anisotropy
+overfitting, we regularize the energy of the anisotropic features during
+training. Our method is flexible and can be plugged into NeRF-based
+frameworks. Extensive experiments show that the proposed representation can
+boost the rendering quality of various NeRFs and achieve state-of-the-art
+rendering performance on both synthetic and real-world scenes.
+
+
+
+
+
+
+ + ☆ Categorical Traffic Transformer: Interpretable and Diverse Behavior + Prediction with Tokenized Latent + + +
+ Adept traffic models are critical to both planning and closed-loop simulation +for autonomous vehicles (AV), and key design objectives include accuracy, +diverse multimodal behaviors, interpretability, and downstream compatibility. +Recently, with the advent of large language models (LLMs), an additional +desirable feature for traffic models is LLM compatibility. We present +Categorical Traffic Transformer (CTT), a traffic model that outputs both +continuous trajectory predictions and tokenized categorical predictions (lane +modes, homotopies, etc.). The most outstanding feature of CTT is its fully +interpretable latent space, which enables direct supervision of the latent +variable from the ground truth during training and avoids mode collapse +completely. As a result, CTT can generate diverse behaviors conditioned on +different latent modes with semantic meanings while beating SOTA on prediction +accuracy. In addition, CTT's ability to input and output tokens enables +integration with LLMs for common-sense reasoning and zero-shot generalization. + +
+
+
+
+
+ + ☆ OmniMotionGPT: Animal Motion Generation with Limited Data + + +
+ Our paper aims to generate diverse and realistic animal motion sequences from
+textual descriptions, without a large-scale animal text-motion dataset. While
+the task of text-driven human motion synthesis is already extensively studied
+and benchmarked, it remains challenging to transfer this success to other
+skeleton structures with limited data. In this work, we design a model
+architecture that imitates the Generative Pretraining Transformer (GPT),
+transferring prior knowledge learned from human data to the animal domain. We
+jointly train motion autoencoders for both animal and human motions and, at the
+same time, optimize similarity scores among the human motion encoding, animal
+motion encoding, and text CLIP embedding. Presenting the first solution to this
+problem, we are able to generate animal motions with high diversity and
+fidelity, quantitatively and qualitatively outperforming the results of
+training human motion generation baselines on animal data. Additionally, we
+introduce AnimalML3D, the first text-animal motion dataset with 1240 animation
+sequences spanning 36 different animal identities. We hope this dataset will
+mitigate the data scarcity problem in text-driven animal motion generation,
+providing a new playground for the research community.
+
+
+
+ comment: The project page is at https://zshyang.github.io/omgpt-website/ +
+
+
+
+
+ + ☆ Reconstructing the normal and shape at specularities in endoscopy + + +
+ Specularities are numerous in endoscopic images. They appear as many small
+white elliptical spots, which are generally discarded as a nuisance in image
+analysis and computer vision methods. Instead, we propose to use specularities
+as cues for 3D perception. Specifically, we propose a new method to
+reconstruct, at each specularity, the observed tissue's normal direction (i.e.,
+its orientation) and shape (i.e., its curvature) from a single image. We show
+results on simulated and real interventional images.
+
+
+
+
+
+
+ + ♻ ☆ Knowledge Pursuit Prompting for Zero-Shot Multimodal Synthesis + + +
+ Hallucinations and unfaithful synthesis due to inaccurate prompts with +insufficient semantic details are widely observed in multimodal generative +models. A prevalent strategy to align multiple modalities is to fine-tune the +generator with a large number of annotated text-image pairs. However, such a +procedure is labor-consuming and resource-draining. The key question we ask is: +can we enhance the quality and faithfulness of text-driven generative models +beyond extensive text-image pair annotations? To address this question, we +propose Knowledge Pursuit Prompting (KPP), a zero-shot framework that +iteratively incorporates external knowledge to help generators produce reliable +visual content. Instead of training generators to handle generic prompts, KPP +employs a recursive knowledge query process to gather informative external +facts from the knowledge base, instructs a language model to compress the +acquired knowledge for prompt refinement, and utilizes text-driven generators +for visual synthesis. The entire process is zero-shot, without accessing the +architectures and parameters of generative models. We evaluate the framework +across multiple text-driven generative tasks (image, 3D rendering, and video) +on datasets of different domains. We further demonstrate the extensibility and +adaptability of KPP through varying foundation model bases and instructions. +Our results show that KPP is capable of generating faithful and semantically +rich content across diverse visual domains, offering a promising solution to +improve multimodal generative models. + +
+
+
+
+
+ + ♻ ☆ Charting New Territories: Exploring the Geographic and Geospatial + Capabilities of Multimodal LLMs + + +
+ Multimodal large language models (MLLMs) have shown remarkable capabilities +across a broad range of tasks but their knowledge and abilities in the +geographic and geospatial domains are yet to be explored, despite potential +wide-ranging benefits to navigation, environmental research, urban development, +and disaster response. We conduct a series of experiments exploring various +vision capabilities of MLLMs within these domains, particularly focusing on the +frontier model GPT-4V, and benchmark its performance against open-source +counterparts. Our methodology involves challenging these models with a +small-scale geographic benchmark consisting of a suite of visual tasks, testing +their abilities across a spectrum of complexity. The analysis uncovers not only +where such models excel, including instances where they outperform humans, but +also where they falter, providing a balanced view of their capabilities in the +geographic domain. To enable the comparison and evaluation of future models, +our benchmark will be publicly released. + +
+
+ comment: V2: Minor formatting changes and added missing subfigure captions +
+
+
+
+
+ + ♻ ☆ ID-Pose: Sparse-view Camera Pose Estimation by Inverting Diffusion + Models + + +
+ Given sparse views of a 3D object, estimating their camera poses is a
+long-standing and intractable problem. Toward this goal, we consider harnessing
+a pre-trained diffusion model of novel views conditioned on viewpoints
+(Zero-1-to-3). We present ID-Pose, which inverts the denoising diffusion
+process to estimate the relative pose given two input images. ID-Pose adds
+noise to one image, and predicts the noise conditioned on the other image and a
+hypothesis of the relative pose. The prediction error is used as the
+minimization objective to find the optimal pose with gradient descent. We
+extend ID-Pose to handle more than two images and estimate each pose with
+multiple image pairs from triangular relations. ID-Pose requires no training
+and generalizes to open-world images. We conduct extensive experiments using
+casually captured photos and rendered images with random viewpoints. The
+results demonstrate that ID-Pose significantly outperforms state-of-the-art
+methods.
+
+
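+
+ The inversion idea can be sketched as optimizing a pose hypothesis against the
+noise-prediction error of a viewpoint-conditioned denoiser. The `denoiser`
+signature, the `alphas_cumprod` schedule argument, and the optimizer settings
+are assumptions for illustration, not the released ID-Pose code.
+
+```python
+import torch
+
+def estimate_relative_pose(denoiser, alphas_cumprod, img_a, img_b, pose_init,
+                           steps=100, lr=1e-2):
+    """Gradient-descent search over a relative-pose hypothesis."""
+    pose = pose_init.clone().requires_grad_(True)
+    optimizer = torch.optim.Adam([pose], lr=lr)
+    for _ in range(steps):
+        t = torch.randint(0, len(alphas_cumprod), (1,)).item()
+        a_t = alphas_cumprod[t]
+        noise = torch.randn_like(img_b)
+        noisy_b = a_t.sqrt() * img_b + (1 - a_t).sqrt() * noise  # forward-noise image B
+        pred = denoiser(noisy_b, img_a, pose, t)                 # assumed interface
+        loss = ((pred - noise) ** 2).mean()                      # noise-prediction error
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+    return pose.detach()
+```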
+
+ comment: Github: https://xt4d.github.io/id-pose-web/ +
+
+
+
+
+ + ♻ ☆ Learning by Aligning 2D Skeleton Sequences in Time + + +
+ This paper presents a self-supervised temporal video alignment framework +which is useful for several fine-grained human activity understanding +applications. In contrast with the state-of-the-art method of CASA, where +sequences of 3D skeleton coordinates are taken directly as input, our key idea +is to use sequences of 2D skeleton heatmaps as input. Unlike CASA which +performs self-attention in the temporal domain only, we feed 2D skeleton +heatmaps to a video transformer which performs self-attention both in the +spatial and temporal domains for extracting effective spatiotemporal and +contextual features. In addition, we introduce simple heatmap augmentation +techniques based on 2D skeletons for self-supervised learning. Despite the lack +of 3D information, our approach achieves not only higher accuracy but also +better robustness against missing and noisy keypoints than CASA. Furthermore, +extensive evaluations on three public datasets, i.e., Penn Action, IKEA ASM, +and H2O, demonstrate that our approach outperforms previous methods in +different fine-grained human activity understanding tasks. Finally, fusing 2D +skeleton heatmaps with RGB videos yields the state-of-the-art on all metrics +and datasets. To our best knowledge, our work is the first to utilize 2D +skeleton heatmap inputs and the first to explore multi-modality fusion for +temporal video alignment. + +
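+
+ Rendering 2D skeletons as heatmaps is commonly done with one Gaussian per
+joint; a minimal sketch, with resolution and sigma chosen arbitrarily rather
+than taken from the paper:
+
+```python
+import numpy as np
+
+def keypoints_to_heatmaps(keypoints, height, width, sigma=2.0):
+    """Render (J, 2) keypoints given as (x, y) into (J, H, W) Gaussian heatmaps."""
+    ys, xs = np.mgrid[0:height, 0:width]
+    maps = np.zeros((len(keypoints), height, width), dtype=np.float32)
+    for j, (x, y) in enumerate(keypoints):
+        maps[j] = np.exp(-((xs - x) ** 2 + (ys - y) ** 2) / (2.0 * sigma ** 2))
+    return maps
+```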
+
+
+
+
+ + ♻ ☆ A Fully Unsupervised Instance Segmentation Technique for White Blood + Cell Images + + +
+ White blood cells, also known as leukocytes, are a group of heterogeneously
+nucleated cells that act as key cells of the immune system. They originate in
+the bone marrow and are found in blood, plasma, and lymph tissues. Leukocytes
+kill bacteria, viruses, and other kinds of pathogens that invade the human body
+through phagocytosis, which in turn confers immunity. Measuring the white blood
+cell count can reveal hidden infections and warn doctors about chronic medical
+conditions such as autoimmune diseases, immune deficiencies, and blood
+disorders. Segmentation plays an important role in the identification of white
+blood cells (WBCs) from microscopic image analysis. The goal of segmentation in
+a microscopic image is to divide the image into distinct regions. In this
+paper, we propose a novel instance segmentation method for segmenting WBCs,
+containing both the nucleus and the cytoplasm, from bone marrow images.
+
+
+
+
+
+
+ + ♻ ☆ Benchmarking Robustness of Text-Image Composed Retrieval NeurIPS 2023 + + +
+ Text-image composed retrieval aims to retrieve the target image through the
+composed query, which is specified in the form of an image plus some text that
+describes desired modifications to the input image. It has recently attracted
+attention due to its ability to leverage both information-rich images and
+concise language to precisely express the requirements for target images.
+However, the robustness of these approaches against real-world corruptions or
+deeper text understanding has never been studied. In this paper, we perform the
+first robustness study and establish three new diversified benchmarks for
+systematic analysis of text-image composed retrieval against natural
+corruptions in both vision and text, and further probe textual understanding.
+For natural corruption analysis, we introduce two new large-scale benchmark
+datasets, CIRR-C and FashionIQ-C, for testing in the open domain and fashion
+domain respectively, both of which apply 15 visual corruptions and 7 textual
+corruptions. For textual understanding analysis, we introduce a new diagnostic
+dataset, CIRR-D, by expanding the original raw data with synthetic data, which
+contains modified text to better probe textual understanding ability, including
+numerical variation, attribute variation, object removal, background variation,
+and fine-grained evaluation. The code and benchmark datasets are available at
+https://github.com/SunTongtongtong/Benchmark-Robustness-Text-Image-Compose-Retrieval.
+
+
+
+ comment: Accepted by R0-FoMo: Workshop on Robustness of Few-shot and Zero-shot + Learning in Foundation Models at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ MMOTU: A Multi-Modality Ovarian Tumor Ultrasound Image Dataset for + Unsupervised Cross-Domain Semantic Segmentation + + +
+ Ovarian cancer is one of the most harmful gynecological diseases. Detecting
+ovarian tumors at an early stage with computer-aided techniques can efficiently
+decrease the mortality rate. With the improvement of medical treatment
+standards, ultrasound images are widely applied in clinical treatment. However,
+recent notable methods mainly focus on single-modality ultrasound ovarian tumor
+segmentation or recognition, which means there is still a lack of research
+exploring the representation capability of multi-modality ultrasound ovarian
+tumor images. To solve this problem, we propose a Multi-Modality Ovarian Tumor
+Ultrasound (MMOTU) image dataset containing 1469 2D ultrasound images and 170
+contrast-enhanced ultrasonography (CEUS) images with pixel-wise and global-wise
+annotations. Based on MMOTU, we mainly focus on the unsupervised cross-domain
+semantic segmentation task. To solve the domain shift problem, we propose a
+feature-alignment-based architecture named Dual-Scheme Domain-Selected Network
+(DS2Net). Specifically, we first design a source encoder and a target encoder
+to extract two-style features of source and target images. Then, we propose the
+Domain-Distinct Selected Module (DDSM) and the Domain-Universal Selected Module
+(DUSM) to extract the distinct and universal features in the two styles
+(source-style or target-style). Finally, we fuse these two kinds of features
+and feed them into the source decoder and target decoder to generate the final
+predictions. Extensive comparison experiments and analysis on the MMOTU image
+dataset show that DS2Net can boost segmentation performance for bidirectional
+cross-domain adaptation between 2D ultrasound images and CEUS images. Our
+proposed dataset and code are available at
+https://github.com/cv516Buaa/MMOTU_DS2Net.
+
+
+
+ comment: code: https://github.com/cv516Buaa/MMOTU_DS2Net paper:18 pages, 12 + figures, 11 tables, 16 formulas +
+
+
+
+
+ + ♻ ☆ Learning Disentangled Identifiers for Action-Customized Text-to-Image + Generation + + +
+ This study focuses on a novel task in text-to-image (T2I) generation, namely +action customization. The objective of this task is to learn the co-existing +action from limited data and generalize it to unseen humans or even animals. +Experimental results show that existing subject-driven customization methods +fail to learn the representative characteristics of actions and struggle in +decoupling actions from context features, including appearance. To overcome the +preference for low-level features and the entanglement of high-level features, +we propose an inversion-based method Action-Disentangled Identifier (ADI) to +learn action-specific identifiers from the exemplar images. ADI first expands +the semantic conditioning space by introducing layer-wise identifier tokens, +thereby increasing the representational richness while distributing the +inversion across different features. Then, to block the inversion of +action-agnostic features, ADI extracts the gradient invariance from the +constructed sample triples and masks the updates of irrelevant channels. To +comprehensively evaluate the task, we present an ActionBench that includes a +variety of actions, each accompanied by meticulously selected samples. Both +quantitative and qualitative results show that our ADI outperforms existing +baselines in action-customized T2I generation. Our project page is at +https://adi-t2i.github.io/ADI. + +
+
+
+
+
+ + ♻ ☆ DreamAvatar: Text-and-Shape Guided 3D Human Avatar Generation via + Diffusion Models + + +
+ We present DreamAvatar, a text-and-shape guided framework for generating +high-quality 3D human avatars with controllable poses. While encouraging +results have been reported by recent methods on text-guided 3D common object +generation, generating high-quality human avatars remains an open challenge due +to the complexity of the human body's shape, pose, and appearance. We propose +DreamAvatar to tackle this challenge, which utilizes a trainable NeRF for +predicting density and color for 3D points and pretrained text-to-image +diffusion models for providing 2D self-supervision. Specifically, we leverage +the SMPL model to provide shape and pose guidance for the generation. We +introduce a dual-observation-space design that involves the joint optimization +of a canonical space and a posed space that are related by a learnable +deformation field. This facilitates the generation of more complete textures +and geometry faithful to the target pose. We also jointly optimize the losses +computed from the full body and from the zoomed-in 3D head to alleviate the +common multi-face ''Janus'' problem and improve facial details in the generated +avatars. Extensive evaluations demonstrate that DreamAvatar significantly +outperforms existing methods, establishing a new state-of-the-art for +text-and-shape guided 3D human avatar generation. + +
+
+ comment: Project page: https://yukangcao.github.io/DreamAvatar/ +
+
+
+
+
+ + ♻ ☆ ViC-MAE: Self-Supervised Representation Learning from Images and Video + with Contrastive Masked Autoencoders + + +
+ We propose ViC-MAE, a model that combines both Masked AutoEncoders (MAE) and
+contrastive learning. ViC-MAE is trained using a global feature obtained by
+pooling the local representations learned under an MAE reconstruction loss, and
+leverages this representation under a contrastive objective across images and
+video frames. We show that visual representations learned under ViC-MAE
+generalize well to both video and image classification tasks. In particular,
+ViC-MAE obtains state-of-the-art transfer learning performance from video to
+images on ImageNet-1k compared to the recently proposed OmniMAE, achieving a
+top-1 accuracy of 86% (+1.3% absolute improvement) when trained on the same
+data and 87.1% (+2.4% absolute improvement) when training on extra data. At the
+same time, ViC-MAE outperforms most other methods on video benchmarks,
+obtaining 75.9% top-1 accuracy on the challenging Something-Something-v2 video
+benchmark. When training on videos and images from a diverse combination of
+datasets, our method maintains a balanced transfer-learning performance between
+video and image classification benchmarks, coming only as a close second to the
+best supervised method.
+
+
+
+ comment: More results on Video an Image datasets, ViC-MAE now supports + training on videos and images +
+
+
+
+
+ + ♻ ☆ Structured Pruning for Deep Convolutional Neural Networks: A survey + + +
+ The remarkable performance of deep Convolutional neural networks (CNNs) is +generally attributed to their deeper and wider architectures, which can come +with significant computational costs. Pruning neural networks has thus gained +interest since it effectively lowers storage and computational costs. In +contrast to weight pruning, which results in unstructured models, structured +pruning provides the benefit of realistic acceleration by producing models that +are friendly to hardware implementation. The special requirements of structured +pruning have led to the discovery of numerous new challenges and the +development of innovative solutions. This article surveys the recent progress +towards structured pruning of deep CNNs. We summarize and compare the +state-of-the-art structured pruning techniques with respect to filter ranking +methods, regularization methods, dynamic execution, neural architecture search, +the lottery ticket hypothesis, and the applications of pruning. While +discussing structured pruning algorithms, we briefly introduce the unstructured +pruning counterpart to emphasize their differences. Furthermore, we provide +insights into potential research opportunities in the field of structured +pruning. A curated list of neural network pruning papers can be found at +https://github.com/he-y/Awesome-Pruning . A dedicated website offering a more +interactive comparison of structured pruning methods can be found at: +https://huggingface.co/spaces/he-yang/Structured-Pruning-Survey . + +
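+
+ The simplest filter-ranking criterion discussed in such surveys scores each
+output channel of a convolution by its L1 norm and prunes the weakest ones; a
+minimal sketch with an arbitrary keep ratio:
+
+```python
+import torch
+
+def l1_channel_mask(conv_weight, keep_ratio=0.5):
+    """Return a boolean mask over output channels of a (O, I, kH, kW) weight."""
+    scores = conv_weight.abs().sum(dim=(1, 2, 3))      # L1 norm per filter
+    k = max(1, int(keep_ratio * scores.numel()))
+    keep = torch.topk(scores, k).indices
+    mask = torch.zeros_like(scores, dtype=torch.bool)
+    mask[keep] = True
+    return mask   # True = keep channel, False = prune it structurally
+```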
+
+ comment: Accepted by IEEE Transactions on Pattern Analysis and Machine + Intelligence +
+
+
+
+
+ + ♻ ☆ Real-World Light Field Image Super-Resolution via Degradation Modulation + + +
+ Recent years have witnessed the great advances of deep neural networks (DNNs) +in light field (LF) image super-resolution (SR). However, existing DNN-based LF +image SR methods are developed on a single fixed degradation (e.g., bicubic +downsampling), and thus cannot be applied to super-resolve real LF images with +diverse degradation. In this paper, we propose a simple yet effective method +for real-world LF image SR. In our method, a practical LF degradation model is +developed to formulate the degradation process of real LF images. Then, a +convolutional neural network is designed to incorporate the degradation prior +into the SR process. By training on LF images using our formulated degradation, +our network can learn to modulate different degradation while incorporating +both spatial and angular information in LF images. Extensive experiments on +both synthetically degraded and real-world LF images demonstrate the +effectiveness of our method. Compared with existing state-of-the-art single and +LF image SR methods, our method achieves superior SR performance under a wide +range of degradation, and generalizes better to real LF images. Codes and +models are available at https://yingqianwang.github.io/LF-DMnet/. + +
+
+ comment: 15 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Less is More -- Towards parsimonious multi-task models using structured + sparsity + + +
+ Model sparsification in deep learning promotes simpler, more interpretable +models with fewer parameters. This not only reduces the model's memory +footprint and computational needs but also shortens inference time. This work +focuses on creating sparse models optimized for multiple tasks with fewer +parameters. These parsimonious models also possess the potential to match or +outperform dense models in terms of performance. In this work, we introduce +channel-wise l1/l2 group sparsity in the shared convolutional layers parameters +(or weights) of the multi-task learning model. This approach facilitates the +removal of extraneous groups i.e., channels (due to l1 regularization) and also +imposes a penalty on the weights, further enhancing the learning efficiency for +all tasks (due to l2 regularization). We analyzed the results of group sparsity +in both single-task and multi-task settings on two widely-used Multi-Task +Learning (MTL) datasets: NYU-v2 and CelebAMask-HQ. On both datasets, which +consist of three different computer vision tasks each, multi-task models with +approximately 70% sparsity outperform their dense equivalents. We also +investigate how changing the degree of sparsification influences the model's +performance, the overall sparsity percentage, the patterns of sparsity, and the +inference time. + +
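+
+ A channel-wise l1/l2 (group lasso) penalty over shared convolutional weights
+can be sketched as below; the regularization strength is a placeholder and the
+exact grouping used in the paper may differ.
+
+```python
+import torch
+
+def group_sparsity_penalty(model, alpha=1e-4):
+    """Sum of per-output-channel l2 norms (l1 across groups) for all conv layers."""
+    penalty = 0.0
+    for module in model.modules():
+        if isinstance(module, torch.nn.Conv2d):
+            w = module.weight                      # (out_c, in_c, kH, kW)
+            penalty = penalty + w.flatten(1).norm(dim=1).sum()
+    return alpha * penalty
+```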
+
+ comment: accepted at First Conference on Parsimony and Learning (CPAL 2024) +
+
+
+
+
+ + ♻ ☆ KOPPA: Improving Prompt-based Continual Learning with Key-Query + Orthogonal Projection and Prototype-based One-Versus-All + + +
+ Drawing inspiration from prompt tuning techniques applied to Large Language +Models, recent methods based on pre-trained ViT networks have achieved +remarkable results in the field of Continual Learning. Specifically, these +approaches propose to maintain a set of prompts and allocate a subset of them +to learn each task using a key-query matching strategy. However, they may +encounter limitations when lacking control over the correlations between old +task queries and keys of future tasks, the shift of features in the latent +space, and the relative separation of latent vectors learned in independent +tasks. In this work, we introduce a novel key-query learning strategy based on +orthogonal projection, inspired by model-agnostic meta-learning, to enhance +prompt matching efficiency and address the challenge of shifting features. +Furthermore, we introduce a One-Versus-All (OVA) prototype-based component that +enhances the classification head distinction. Experimental results on benchmark +datasets demonstrate that our method empowers the model to achieve results +surpassing those of current state-of-the-art approaches by a large margin of up +to 20%. + +
+
+
+
+
+ + ♻ ☆ Differentiable JPEG: The Devil is in the Details WACV 2024 + + +
+ JPEG remains one of the most widespread lossy image coding methods. However, +the non-differentiable nature of JPEG restricts the application in deep +learning pipelines. Several differentiable approximations of JPEG have recently +been proposed to address this issue. This paper conducts a comprehensive review +of existing diff. JPEG approaches and identifies critical details that have +been missed by previous methods. To this end, we propose a novel diff. JPEG +approach, overcoming previous limitations. Our approach is differentiable +w.r.t. the input image, the JPEG quality, the quantization tables, and the +color conversion parameters. We evaluate the forward and backward performance +of our diff. JPEG approach against existing methods. Additionally, extensive +ablations are performed to evaluate crucial design choices. Our proposed diff. +JPEG resembles the (non-diff.) reference implementation best, significantly +surpassing the recent-best diff. approach by $3.47$dB (PSNR) on average. For +strong compression rates, we can even improve PSNR by $9.51$dB. Strong +adversarial attack results are yielded by our diff. JPEG, demonstrating the +effective gradient approximation. Our code is available at +https://github.com/necla-ml/Diff-JPEG. + +
+
+ comment: Accepted at WACV 2024. Project page: + https://christophreich1996.github.io/differentiable_jpeg/ +
+
+
+
+
+ + ♻ ☆ Generative-based Fusion Mechanism for Multi-Modal Tracking + + +
+ Generative models (GMs) have received increasing research interest for their +remarkable capacity to achieve comprehensive understanding. However, their +potential application in the domain of multi-modal tracking has remained +relatively unexplored. In this context, we seek to uncover the potential of +harnessing generative techniques to address the critical challenge, information +fusion, in multi-modal tracking. In this paper, we delve into two prominent GM +techniques, namely, Conditional Generative Adversarial Networks (CGANs) and +Diffusion Models (DMs). Different from the standard fusion process where the +features from each modality are directly fed into the fusion block, we +condition these multi-modal features with random noise in the GM framework, +effectively transforming the original training samples into harder instances. +This design excels at extracting discriminative clues from the features, +enhancing the ultimate tracking performance. To quantitatively gauge the +effectiveness of our approach, we conduct extensive experiments across two +multi-modal tracking tasks, three baseline methods, and three challenging +benchmarks. The experimental results demonstrate that the proposed +generative-based fusion mechanism achieves state-of-the-art performance, +setting new records on LasHeR and RGBD1K. + +
+
+
+
+
+ + ♻ ☆ EXIM: A Hybrid Explicit-Implicit Representation for Text-Guided 3D Shape + Generation SIGGRAPH + + +
+ This paper presents a new text-guided technique for generating 3D shapes. The +technique leverages a hybrid 3D shape representation, namely EXIM, combining +the strengths of explicit and implicit representations. Specifically, the +explicit stage controls the topology of the generated 3D shapes and enables +local modifications, whereas the implicit stage refines the shape and paints it +with plausible colors. Also, the hybrid approach separates the shape and color +and generates color conditioned on shape to ensure shape-color consistency. +Unlike the existing state-of-the-art methods, we achieve high-fidelity shape +generation from natural-language descriptions without the need for +time-consuming per-shape optimization or reliance on human-annotated texts +during training or test-time optimization. Further, we demonstrate the +applicability of our approach to generate indoor scenes with consistent styles +using text-induced 3D shapes. Through extensive experiments, we demonstrate the +compelling quality of our results and the high coherency of our generated +shapes with the input texts, surpassing the performance of existing methods by +a significant margin. Codes and models are released at +https://github.com/liuzhengzhe/EXIM. + +
+
+ comment: SIGGRAPH Asia 2023 & TOG Project page: + https://liuzhengzhe.github.io/EXIM.github.io/ +
+
+
+
+
+ + ♻ ☆ Extending Explainable Boosting Machines to Scientific Image Data + + +
+ As the deployment of computer vision technology becomes increasingly common +in science, the need for explanations of the system and its output has become a +focus of great concern. Driven by the pressing need for interpretable models in +science, we propose the use of Explainable Boosting Machines (EBMs) for +scientific image data. Inspired by an important application underpinning the +development of quantum technologies, we apply EBMs to cold-atom soliton image +data tabularized using Gabor Wavelet Transform-based techniques that preserve +the spatial structure of the data. In doing so, we demonstrate the use of EBMs +for image data for the first time and show that our approach provides +explanations that are consistent with human intuition about the data. + +
+
+ comment: 7 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Adapter is All You Need for Tuning Visual Tasks + + +
+ Pre-training & fine-tuning can enhance the transferring efficiency and +performance in visual tasks. Recent delta-tuning methods provide more options +for visual classification tasks. Despite their success, existing visual +delta-tuning art fails to exceed the upper limit of full fine-tuning on +challenging tasks like instance segmentation and semantic segmentation. To find +a competitive alternative to full fine-tuning, we propose the Multi-cognitive +Visual Adapter (Mona) tuning, a novel adapter-based tuning method. First, we +introduce multiple vision-friendly filters into the adapter to enhance its +ability to process visual signals, while previous methods mainly rely on +language-friendly linear filters. Second, we add the scaled normalization layer +in the adapter to regulate the distribution of input features for visual +filters. To fully demonstrate the practicality and generality of Mona, we +conduct experiments on multiple representative visual tasks, including instance +segmentation on COCO, semantic segmentation on ADE20K, object detection on +Pascal VOC, and image classification on several common datasets. Exciting +results illustrate that Mona surpasses full fine-tuning on all these tasks and +is the only delta-tuning method outperforming full fine-tuning on instance +segmentation and semantic segmentation tasks. For example, Mona achieves a 1% +performance gain on the COCO dataset compared to full fine-tuning. +Comprehensive results suggest that Mona-tuning is more suitable for retaining +and utilizing the capabilities of pre-trained models than full fine-tuning. The +code will be released at https://github.com/Leiyi-Hu/mona. + +
+
+
+
+
+ + ♻ ☆ CatVersion: Concatenating Embeddings for Diffusion-Based Text-to-Image + Personalization + + +
+ We propose CatVersion, an inversion-based method that learns the personalized +concept through a handful of examples. Subsequently, users can utilize text +prompts to generate images that embody the personalized concept, thereby +achieving text-to-image personalization. In contrast to existing approaches +that emphasize word embedding learning or parameter fine-tuning for the +diffusion model, which potentially causes concept dilution or overfitting, our +method concatenates embeddings on the feature-dense space of the text encoder +in the diffusion model to learn the gap between the personalized concept and +its base class, aiming to maximize the preservation of prior knowledge in +diffusion models while restoring the personalized concepts. To this end, we +first dissect the text encoder's integration in the image generation process to +identify the feature-dense space of the encoder. Afterward, we concatenate +embeddings on the Keys and Values in this space to learn the gap between the +personalized concept and its base class. In this way, the concatenated +embeddings ultimately manifest as a residual on the original attention output. +To more accurately and unbiasedly quantify the results of personalized image +generation, we improve the CLIP image alignment score based on masks. +Qualitatively and quantitatively, CatVersion helps to restore personalization +concepts more faithfully and enables more robust editing. + +
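A toy illustration of the core mechanism, concatenating a few learnable embeddings onto the Keys and Values of an attention layer so that, through the softmax, they act as a residual on the original attention output, is shown below; the dimensions and initialization are assumptions and this is not CatVersion's implementation.

```python
# Toy sketch (assumed shapes, not CatVersion's code): concatenate learnable
# embeddings onto the Keys and Values of an attention layer.
import torch
import torch.nn.functional as F

d, n_tokens, n_new = 64, 8, 2
q = torch.randn(1, n_tokens, d)
k = torch.randn(1, n_tokens, d)
v = torch.randn(1, n_tokens, d)

k_new = torch.nn.Parameter(torch.randn(1, n_new, d) * 0.02)  # learned concept embeddings
v_new = torch.nn.Parameter(torch.randn(1, n_new, d) * 0.02)

def attn(q, k, v):
    w = F.softmax(q @ k.transpose(-2, -1) / d ** 0.5, dim=-1)
    return w @ v

base = attn(q, k, v)                                              # original attention output
edited = attn(q, torch.cat([k, k_new], 1), torch.cat([v, v_new], 1))
print((edited - base).abs().mean())   # the concatenation shows up as a residual on the output
```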
+
+ comment: For the project page, please visit + https://royzhao926.github.io/CatVersion-page/ +
+
+
+
+
+ + ♻ ☆ Diffusion Models for Imperceptible and Transferable Adversarial Attack + + +
+ Many existing adversarial attacks generate $L_p$-norm perturbations on image +RGB space. Despite some achievements in transferability and attack success +rate, the crafted adversarial examples are easily perceived by human eyes. +Towards visual imperceptibility, some recent works explore unrestricted attacks +without $L_p$-norm constraints, yet lacking transferability of attacking +black-box models. In this work, we propose a novel imperceptible and +transferable attack by leveraging both the generative and discriminative power +of diffusion models. Specifically, instead of direct manipulation in pixel +space, we craft perturbations in the latent space of diffusion models. Combined +with well-designed content-preserving structures, we can generate +human-insensitive perturbations embedded with semantic clues. For better +transferability, we further "deceive" the diffusion model which can be viewed +as an implicit recognition surrogate, by distracting its attention away from +the target regions. To our knowledge, our proposed method, DiffAttack, is the +first that introduces diffusion models into the adversarial attack field. +Extensive experiments on various model structures, datasets, and defense +methods have demonstrated the superiority of our attack over the existing +attack methods. + +
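The following is only a conceptual sketch of latent-space adversarial optimization: a toy linear "decoder" stands in for a diffusion model's latent-to-image mapping and a toy linear classifier stands in for the surrogate; the loss weights and optimizer settings are arbitrary assumptions, and none of this reproduces DiffAttack's attention-distraction or content-preserving components.

```python
# Conceptual sketch only: optimize a perturbation in a latent space so that a
# stand-in classifier is fooled while the decoded image stays close to the original.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
decode = torch.nn.Linear(32, 3 * 8 * 8)        # stand-in for latent -> image
classifier = torch.nn.Linear(3 * 8 * 8, 10)    # stand-in surrogate classifier

z0 = torch.randn(1, 32)                        # latent of the clean image
x0 = decode(z0).detach()
true_label = torch.tensor([3])

delta = torch.zeros_like(z0, requires_grad=True)
opt = torch.optim.Adam([delta], lr=0.05)
for _ in range(100):
    x_adv = decode(z0 + delta)
    loss = -F.cross_entropy(classifier(x_adv), true_label)    # push away from the true label
    loss = loss + 10.0 * F.mse_loss(x_adv, x0)                # keep the image perceptually close
    opt.zero_grad(); loss.backward(); opt.step()
print(delta.norm().item())
```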
+
+ comment: Code Page: https://github.com/WindVChen/DiffAttack. In Paper Version + v2, we incorporate more discussions and experiments +
+
+
+
+
+ + ♻ ☆ Assessment of Deep Learning Segmentation for Real-Time Free-Breathing + Cardiac Magnetic Resonance Imaging + + +
+ In recent years, a variety of deep learning networks for cardiac MRI (CMR)
+segmentation have been developed and analyzed. However, nearly all of them are
+focused on cine CMR under breath-hold. In this work, the accuracy of deep
+learning methods is assessed for volumetric analysis (via segmentation) of the
+left ventricle in real-time free-breathing CMR at rest and under exercise
+stress. Data from healthy volunteers (n=15) for cine and real-time
+free-breathing CMR were analyzed retrospectively. Segmentations of commercial
+software (comDL) and a freely available neural network (nnU-Net) were compared
+to a reference created via the manual correction of comDL segmentation.
+Segmentation of left ventricular endocardium (LV), left ventricular myocardium
+(MYO), and right ventricle (RV) is evaluated for both end-systolic and
+end-diastolic phases and analyzed with Dice's coefficient (DC). The volumetric
+analysis includes LV end-diastolic volume (EDV), LV end-systolic volume (ESV),
+and LV ejection fraction (EF). For cine CMR, nnU-Net and comDL achieve a DC
+above 0.95 for LV and above 0.9 for MYO and RV. For real-time CMR, the accuracy
+of nnU-Net exceeds that of comDL overall. For real-time CMR at rest, nnU-Net
+achieves a DC of 0.94 for LV, 0.89 for MYO, and 0.90 for RV; mean absolute
+differences between nnU-Net and reference are 2.9mL for EDV, 3.5mL for ESV and
+2.6% for EF. For real-time CMR under exercise stress, nnU-Net achieves a DC of
+0.92 for LV, 0.85 for MYO, and 0.83 for RV; mean absolute differences between
+nnU-Net and reference are 11.4mL for EDV, 2.9mL for ESV and 3.6% for EF. Deep
+learning methods designed or trained for cine CMR segmentation can perform well
+on real-time CMR. For real-time free-breathing CMR at rest, the performance of
+deep learning methods is comparable to inter-observer variability in cine CMR
+and is usable for fully automatic segmentation.
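For reference, the Dice coefficient used in this evaluation has a short standard definition; the snippet below is a generic implementation for binary masks, with toy rectangles standing in for LV/MYO/RV segmentations, not the study's evaluation code.

```python
# Minimal Dice coefficient for binary segmentation masks (standard definition).
import numpy as np

def dice(pred, ref, eps=1e-7):
    pred, ref = pred.astype(bool), ref.astype(bool)
    inter = np.logical_and(pred, ref).sum()
    return (2.0 * inter + eps) / (pred.sum() + ref.sum() + eps)

a = np.zeros((64, 64)); a[16:48, 16:48] = 1   # toy reference mask
b = np.zeros((64, 64)); b[20:52, 20:52] = 1   # toy predicted mask
print(round(dice(a, b), 3))
```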
+
+ comment: Martin Schilling and Christina Unterberg-Buchwald contributed equally + to this work +
+
+
+
+
+ + ♻ ☆ Compositor: Bottom-up Clustering and Compositing for Robust Part and + Object Segmentation + + +
+ In this work, we present a robust approach for joint part and object +segmentation. Specifically, we reformulate object and part segmentation as an +optimization problem and build a hierarchical feature representation including +pixel, part, and object-level embeddings to solve it in a bottom-up clustering +manner. Pixels are grouped into several clusters where the part-level +embeddings serve as cluster centers. Afterwards, object masks are obtained by +compositing the part proposals. This bottom-up interaction is shown to be +effective in integrating information from lower semantic levels to higher +semantic levels. Based on that, our novel approach Compositor produces part and +object segmentation masks simultaneously while improving the mask quality. +Compositor achieves state-of-the-art performance on PartImageNet and +Pascal-Part by outperforming previous methods by around 0.9% and 1.3% on +PartImageNet, 0.4% and 1.7% on Pascal-Part in terms of part and object mIoU and +demonstrates better robustness against occlusion by around 4.4% and 7.1% on +part and object respectively. Code will be available at +https://github.com/TACJu/Compositor. + +
+
+
+
+
+ + ♻ ☆ Monocular Camera Localization for Automated Vehicles Using Image + Retrieval + + +
+ We address the problem of finding the current position and heading angle of
+an autonomous vehicle in real time using a single camera. Compared to methods
+which require LiDARs and high-definition (HD) 3D maps in real time, the
+proposed approach is easily scalable and computationally efficient, at the
+price of lower precision.
+ The new method combines and adapts existing algorithms in three different
+fields: image retrieval, mapping databases, and particle filtering. The result
+is a simple, real-time localization method based on image retrieval whose
+performance is comparable to other monocular camera localization methods that
+use a map built with LiDARs.
+ We evaluate the proposed method using the KITTI odometry dataset and via
+closed-loop experiments with an indoor 1:10 autonomous vehicle. The tests
+demonstrate real-time capability and 10 cm-level accuracy. Also, experimental
+results of the closed-loop indoor tests show the presence of a positive
+feedback loop between the localization error and the control error. This
+phenomenon is analysed in detail at the end of the article.
+
+
+
+
+ + ♻ ☆ Caterpillar: A Pure-MLP Architecture with Shifted-Pillars-Concatenation + + +
+ Modeling in computer vision has recently evolved toward MLPs. Vision MLPs
+naturally lack local modeling capability, and the simplest remedy is to combine
+them with convolutional layers. Convolution, however, relies on a
+sliding-window scheme that introduces redundancy and low computational
+efficiency. In this paper, we seek to dispense with the windowing scheme and
+introduce a more elaborate and effective approach to exploiting locality. To
+this end, we propose a new MLP module, namely Shifted-Pillars-Concatenation
+(SPC), which consists of two processing steps: (1) Pillars-Shift, which
+generates four neighboring maps by shifting the input image along four
+directions, and (2) Pillars-Concatenation, which applies linear transformations
+and concatenation on the maps to aggregate local features. The SPC module
+offers superior local modeling power and performance gains, making it a
+promising alternative to the convolutional layer. We then build a pure-MLP
+architecture called Caterpillar by replacing the convolutional layer with the
+SPC module in a hybrid model of sMLPNet. Extensive experiments show
+Caterpillar's excellent performance and scalability on both ImageNet-1K and
+small-scale classification benchmarks.
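A minimal sketch of the Shifted-Pillars-Concatenation step as described in the abstract is given below: shift the feature map along four directions, apply a linear transform to each shifted copy, and concatenate. The channel split, the use of a circular shift via torch.roll (rather than zero padding), and the tensor layout are assumptions.

```python
# Sketch of the SPC idea described in the abstract (assumed details, not the paper's code).
import torch
import torch.nn as nn

class SPC(nn.Module):
    def __init__(self, dim=32):
        super().__init__()
        self.proj = nn.ModuleList(nn.Linear(dim, dim // 4) for _ in range(4))
        self.shifts = [(1, 0), (-1, 0), (0, 1), (0, -1)]   # down/up/right/left neighbor maps

    def forward(self, x):                                  # x: (B, H, W, C)
        outs = []
        for (dh, dw), proj in zip(self.shifts, self.proj):
            shifted = torch.roll(x, shifts=(dh, dw), dims=(1, 2))
            outs.append(proj(shifted))                     # linear transform per shifted map
        return torch.cat(outs, dim=-1)                     # local aggregation, no sliding window

x = torch.randn(2, 14, 14, 32)
print(SPC()(x).shape)    # torch.Size([2, 14, 14, 32])
```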
+
+
+
+
+ + ♻ ☆ ClothCombo: Modeling Inter-Cloth Interaction for Draping Multi-Layered + Clothes + + +
+ We present ClothCombo, a pipeline to drape arbitrary combinations of clothes +on 3D human models with varying body shapes and poses. While existing +learning-based approaches for draping clothes have shown promising results, +multi-layered clothing remains challenging as it is non-trivial to model +inter-cloth interaction. To this end, our method utilizes a GNN-based network +to efficiently model the interaction between clothes in different layers, thus +enabling multi-layered clothing. Specifically, we first create feature +embedding for each cloth using a topology-agnostic network. Then, the draping +network deforms all clothes to fit the target body shape and pose without +considering inter-cloth interaction. Lastly, the untangling network predicts +the per-vertex displacements in a way that resolves interpenetration between +clothes. In experiments, the proposed model demonstrates strong performance in +complex multi-layered scenarios. Being agnostic to cloth topology, our method +can be readily used for layered virtual try-on of real clothes in diverse poses +and combinations of clothes. + +
+
+
+
+
+ + ♻ ☆ Recent Advances of Continual Learning in Computer Vision: An Overview + + +
+ In contrast to batch learning where all training data is available at once, +continual learning represents a family of methods that accumulate knowledge and +learn continuously with data available in sequential order. Similar to the +human learning process with the ability of learning, fusing, and accumulating +new knowledge coming at different time steps, continual learning is considered +to have high practical significance. Hence, continual learning has been studied +in various artificial intelligence tasks. In this paper, we present a +comprehensive review of the recent progress of continual learning in computer +vision. In particular, the works are grouped by their representative +techniques, including regularization, knowledge distillation, memory, +generative replay, parameter isolation, and a combination of the above +techniques. For each category of these techniques, both its characteristics and +applications in computer vision are presented. At the end of this overview, +several subareas, where continuous knowledge accumulation is potentially +helpful while continual learning has not been well studied, are discussed. + +
+
+
+
+
+ + ♻ ☆ Check, Locate, Rectify: A Training-Free Layout Calibration System for + Text-to-Image Generation + + +
+ Diffusion models have recently achieved remarkable progress in generating +realistic images. However, challenges remain in accurately understanding and +synthesizing the layout requirements in the textual prompts. To align the +generated image with layout instructions, we present a training-free layout +calibration system SimM that intervenes in the generative process on the fly +during inference time. Specifically, following a "check-locate-rectify" +pipeline, the system first analyses the prompt to generate the target layout +and compares it with the intermediate outputs to automatically detect errors. +Then, by moving the located activations and making intra- and inter-map +adjustments, the rectification process can be performed with negligible +computational overhead. To evaluate SimM over a range of layout requirements, +we present a benchmark SimMBench that compensates for the lack of superlative +spatial relations in existing datasets. And both quantitative and qualitative +results demonstrate the effectiveness of the proposed SimM in calibrating the +layout inconsistencies. Our project page is at https://simm-t2i.github.io/SimM. + +
+
+
+
+
+ + ♻ ☆ Dense Pixel-to-Pixel Harmonization via Continuous Image Representation + + +
+ High-resolution (HR) image harmonization is of great significance in +real-world applications such as image synthesis and image editing. However, due +to the high memory costs, existing dense pixel-to-pixel harmonization methods +are mainly focusing on processing low-resolution (LR) images. Some recent works +resort to combining with color-to-color transformations but are either limited +to certain resolutions or heavily depend on hand-crafted image filters. In this +work, we explore leveraging the implicit neural representation (INR) and +propose a novel image Harmonization method based on Implicit neural Networks +(HINet), which to the best of our knowledge, is the first dense pixel-to-pixel +method applicable to HR images without any hand-crafted filter design. Inspired +by the Retinex theory, we decouple the MLPs into two parts to respectively +capture the content and environment of composite images. A Low-Resolution Image +Prior (LRIP) network is designed to alleviate the Boundary Inconsistency +problem, and we also propose new designs for the training and inference +process. Extensive experiments have demonstrated the effectiveness of our +method compared with state-of-the-art methods. Furthermore, some interesting +and practical applications of the proposed method are explored. Our code is +available at https://github.com/WindVChen/INR-Harmonization. + +
+
+ comment: Accepted by IEEE Transactions on Circuits and Systems for Video + Technology (TCSVT) +
+
+
+
+
+ + ♻ ☆ Fast-ParC: Capturing Position Aware Global Feature for ConvNets and ViTs ECCV 2022 + + +
+ Transformer models have made tremendous progress in various fields in recent
+years. In the field of computer vision, vision transformers (ViTs) have also
+become strong alternatives to convolutional neural networks (ConvNets), yet
+they have not been able to replace ConvNets since both have their own merits.
+For instance, ViTs are good at extracting global features with attention
+mechanisms while ConvNets are more efficient in modeling local relationships
+due to their strong inductive bias. A natural idea that arises is to combine
+the strengths of both ConvNets and ViTs to design new structures. In this
+paper, we propose a new basic neural network operator named position-aware
+circular convolution (ParC) and its accelerated version Fast-ParC. The ParC
+operator can capture global features by using a global kernel and circular
+convolution while keeping location sensitivity by employing position
+embeddings. Our Fast-ParC further reduces the O(n^2) time complexity of ParC to
+O(n log n) using the Fast Fourier Transform. This acceleration makes it
+possible to use global convolution in the early stages of models with large
+feature maps, while keeping the overall computational cost comparable to using
+3x3 or 7x7 kernels. The proposed operation can be used in a plug-and-play
+manner to 1) convert ViTs to pure-ConvNet architecture to enjoy wider hardware
+support and achieve higher inference speed; 2) replace traditional convolutions
+in the deep stage of ConvNets to improve accuracy by enlarging the effective
+receptive field. Experimental results show that our ParC op can effectively
+enlarge the receptive field of traditional ConvNets, and adopting the proposed
+op benefits both ViTs and ConvNet models on all three popular vision tasks,
+image classification, object
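The O(n^2) to O(n log n) reduction rests on the standard fact that circular convolution becomes pointwise multiplication in the Fourier domain. The 1-D sketch below verifies this equivalence numerically; position embeddings and the 2-D integration into a network are omitted.

```python
# 1-D illustration: circular (global-kernel) convolution computed directly in O(n^2)
# and via the FFT in O(n log n); both give the same result.
import numpy as np

n = 8
x = np.random.randn(n)
kernel = np.random.randn(n)                     # global kernel, same length as the input

direct = np.array([sum(kernel[j] * x[(i - j) % n] for j in range(n)) for i in range(n)])
via_fft = np.real(np.fft.ifft(np.fft.fft(x) * np.fft.fft(kernel)))

print(np.allclose(direct, via_fft))             # True: identical circular convolution
```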
+
+ comment: 22 pages, 8 figures, 10 tables. A preliminary version of this paper + has been published in ECCV 2022 and it can be find in arXiv:2203.03952 +
+
+
+
+
+ + ♻ ☆ SPiC-E : Structural Priors in 3D Diffusion Models using Cross-Entity + Attention + + +
+ We are witnessing rapid progress in automatically generating and manipulating +3D assets due to the availability of pretrained text-image diffusion models. +However, time-consuming optimization procedures are required for synthesizing +each sample, hindering their potential for democratizing 3D content creation. +Conversely, 3D diffusion models now train on million-scale 3D datasets, +yielding high-quality text-conditional 3D samples within seconds. In this work, +we present SPiC-E - a neural network that adds structural guidance to 3D +diffusion models, extending their usage beyond text-conditional generation. At +its core, our framework introduces a cross-entity attention mechanism that +allows for multiple entities (in particular, paired input and guidance 3D +shapes) to interact via their internal representations within the denoising +network. We utilize this mechanism for learning task-specific structural priors +in 3D diffusion models from auxiliary guidance shapes. We show that our +approach supports a variety of applications, including 3D stylization, semantic +shape editing and text-conditional abstraction-to-3D, which transforms +primitive-based abstractions into highly-expressive shapes. Extensive +experiments demonstrate that SPiC-E achieves SOTA performance over these tasks +while often being considerably faster than alternative methods. Importantly, +this is accomplished without tailoring our approach for any specific task. + +
+
+ comment: Project webpage: https://tau-vailab.github.io/spic-e +
+
+
+
+
+ + ♻ ☆ AnyText: Multilingual Visual Text Generation And Editing + + +
+ Diffusion-based text-to-image generation has recently achieved impressive
+results. Although current technology for synthesizing images is highly advanced
+and capable of generating images with high fidelity, the text regions of
+generated images often still give the synthesis away. To address this issue, we
+introduce AnyText, a diffusion-based multilingual visual text generation and
+editing model that focuses on rendering accurate and coherent text in the
+image. AnyText comprises a diffusion pipeline with two primary elements: an
+auxiliary latent module and a text embedding module. The former uses inputs
+like text glyph, position, and masked image to generate latent features for
+text generation or editing. The latter employs an OCR model for encoding stroke
+data as embeddings, which blend with image caption embeddings from the
+tokenizer to generate texts that seamlessly integrate with the background. We
+employed text-control diffusion loss and text perceptual loss for training to
+further enhance writing accuracy. AnyText can write characters in multiple
+languages; to the best of our knowledge, this is the first work to address
+multilingual visual text generation. It is worth mentioning that AnyText can be
+plugged into existing diffusion models from the community for rendering or
+editing text accurately. After conducting extensive evaluation experiments, our
+method has outperformed all other approaches by a significant margin.
+Additionally, we contribute the first large-scale multilingual text image
+dataset, AnyWord-3M, containing 3 million image-text pairs with OCR annotations
+in multiple languages. Based on the AnyWord-3M dataset, we propose
+AnyText-benchmark for the evaluation of visual text generation accuracy and
+quality. Our project will be open-sourced on
+https://github.com/tyxsspa/AnyText to improve and promote the development of
+text generation technology.
+
+
+
+
+ + ♻ ☆ Enhancing Point Annotations with Superpixel and Confidence Learning + Guided for Improving Semi-Supervised OCT Fluid Segmentation SP + + +
+ Automatic segmentation of fluid in Optical Coherence Tomography (OCT) images
+helps ophthalmologists make an accurate diagnosis. Although semi-supervised OCT
+fluid segmentation networks enhance their performance by introducing additional
+unlabeled data, the performance enhancement is limited. To address this, we
+propose the Superpixel and Confident Learning Guided Point Annotations Network
+(SCLGPA-Net) based on the teacher-student architecture, which can learn OCT
+fluid segmentation from limited fully-annotated data and abundant
+point-annotated data. Specifically, we use points to annotate fluid regions in
+unlabeled OCT images, and the Superpixel-Guided Pseudo-Label Generation (SGPLG)
+module generates pseudo-labels and pixel-level label trust maps from the point
+annotations. The label trust maps provide an indication of the reliability of
+the pseudo-labels. Furthermore, we propose the Confident Learning Guided Label
+Refinement (CLGLR) module, which identifies erroneous information in the
+pseudo-labels and refines them further. Experiments on the RETOUCH dataset show
+that we are able to reduce the need for fully-annotated data by 94.22\%,
+closing the gap with the best fully supervised baselines to a mean IoU of only
+2\%. Furthermore, we constructed a private 2D OCT fluid segmentation dataset
+for evaluation. Compared with other methods, comprehensive experimental results
+demonstrate that the proposed method achieves excellent performance in OCT
+fluid segmentation.
+
+ comment: Submission to BSPC +
+
+
+
+
+ + ♻ ☆ Cascade Learning Localises Discriminant Features in Visual Scene + Classification + + +
+ Lack of interpretability of deep convolutional neural networks (DCNN) is a +well-known problem particularly in the medical domain as clinicians want +trustworthy automated decisions. One way to improve trust is to demonstrate the +localisation of feature representations with respect to expert labeled regions +of interest. In this work, we investigate the localisation of features learned +via two varied learning paradigms and demonstrate the superiority of one +learning approach with respect to localisation. Our analysis on medical and +natural datasets show that the traditional end-to-end (E2E) learning strategy +has a limited ability to localise discriminative features across multiple +network layers. We show that a layer-wise learning strategy, namely cascade +learning (CL), results in more localised features. Considering localisation +accuracy, we not only show that CL outperforms E2E but that it is a promising +method of predicting regions. On the YOLO object detection framework, our best +result shows that CL outperforms the E2E scheme by $2\%$ in mAP. + +
+
+
+
+
+ + ♻ ☆ MDCS: More Diverse Experts with Consistency Self-distillation for + Long-tailed Recognition ICCV2023 + + +
+ Recently, multi-expert methods have led to significant improvements in +long-tail recognition (LTR). We summarize two aspects that need further +enhancement to contribute to LTR boosting: (1) More diverse experts; (2) Lower +model variance. However, the previous methods didn't handle them well. To this +end, we propose More Diverse experts with Consistency Self-distillation (MDCS) +to bridge the gap left by earlier methods. Our MDCS approach consists of two +core components: Diversity Loss (DL) and Consistency Self-distillation (CS). In +detail, DL promotes diversity among experts by controlling their focus on +different categories. To reduce the model variance, we employ KL divergence to +distill the richer knowledge of weakly augmented instances for the experts' +self-distillation. In particular, we design Confident Instance Sampling (CIS) +to select the correctly classified instances for CS to avoid biased/noisy +knowledge. In the analysis and ablation study, we demonstrate that our method +compared with previous work can effectively increase the diversity of experts, +significantly reduce the variance of the model, and improve recognition +accuracy. Moreover, the roles of our DL and CS are mutually reinforcing and +coupled: the diversity of experts benefits from the CS, and the CS cannot +achieve remarkable results without the DL. Experiments show our MDCS +outperforms the state-of-the-art by 1% $\sim$ 2% on five popular long-tailed +benchmarks, including CIFAR10-LT, CIFAR100-LT, ImageNet-LT, Places-LT, and +iNaturalist 2018. The code is available at https://github.com/fistyee/MDCS. + +
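A hedged sketch of the consistency self-distillation term described above: KL divergence distills predictions on a weakly augmented view into the strongly augmented view, restricted to confidently (correctly) classified instances. The random logits, the simplified confidence rule, and the loss normalization are toy assumptions rather than the paper's exact formulation.

```python
# Toy sketch of KL-based consistency self-distillation with confident instance sampling.
import torch
import torch.nn.functional as F

logits_weak = torch.randn(8, 10)     # teacher view (weak augmentation)
logits_strong = torch.randn(8, 10)   # student view (strong augmentation)
labels = torch.randint(0, 10, (8,))

with torch.no_grad():
    probs_weak = logits_weak.softmax(-1)
    confident = probs_weak.argmax(-1) == labels          # keep only correctly classified instances

kl = F.kl_div(logits_strong.log_softmax(-1), probs_weak, reduction="none").sum(-1)
cs_loss = (kl * confident).sum() / confident.sum().clamp(min=1)
print(cs_loss.item())
```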
+
+ comment: ICCV2023 Accept. 13 pages +
+
+
+
+
+ + ♻ ☆ Generating More Pertinent Captions by Leveraging Semantics and Style on + Multi-Source Datasets + + +
+ This paper addresses the task of generating fluent descriptions by training +on a non-uniform combination of data sources, containing both human-annotated +and web-collected captions. Large-scale datasets with noisy image-text pairs, +indeed, provide a sub-optimal source of supervision because of their +low-quality descriptive style, while human-annotated datasets are cleaner but +smaller in scale. To get the best of both worlds, we propose to leverage and +separate semantics and descriptive style through the incorporation of a style +token and keywords extracted through a retrieval component. The proposed model +avoids the need of object detectors, is trained with a single objective of +prompt language modeling, and can replicate the style of human-collected +captions while training on sources with different input styles. Experimentally, +the model shows a strong capability of recognizing real-world concepts and +producing high-quality captions. Extensive experiments are performed on +different image captioning datasets, including CC3M, nocaps, and the +competitive COCO dataset, where our model consistently outperforms baselines +and state-of-the-art approaches. + +
+
+ comment: Accepted to IJCV +
+
+
+
+
+ + ♻ ☆ DAP: Domain-aware Prompt Learning for Vision-and-Language Navigation + + +
+ Following language instructions to navigate in unseen environments is a +challenging task for autonomous embodied agents. With strong representation +capabilities, pretrained vision-and-language models are widely used in VLN. +However, most of them are trained on web-crawled general-purpose datasets, +which incurs a considerable domain gap when used for VLN tasks. To address the +problem, we propose a novel and model-agnostic domain-aware prompt learning +(DAP) framework. For equipping the pretrained models with specific object-level +and scene-level cross-modal alignment in VLN tasks, DAP applies a low-cost +prompt tuning paradigm to learn soft visual prompts for extracting in-domain +image semantics. Specifically, we first generate a set of in-domain image-text +pairs with the help of the CLIP model. Then we introduce soft visual prompts in +the input space of the visual encoder in a pretrained model. DAP injects +in-domain visual knowledge into the visual encoder of the pretrained model in +an efficient way. Experimental results on both R2R and REVERIE show the +superiority of DAP compared to existing state-of-the-art methods. + +
+
+ comment: 4 pages. arXiv admin note: substantial text overlap with + arXiv:2309.03661 +
+
+
+
+
+ + ♻ ☆ Prompt-based Context- and Domain-aware Pretraining for Vision and + Language Navigation + + +
+ With strong representation capabilities, pretrained vision-language models +are widely used in vision and language navigation (VLN). However, most of them +are trained on web-crawled general-purpose datasets, which incurs a +considerable domain gap when used for VLN tasks. Another challenge for VLN is +how the agent understands the contextual relations between actions on a +trajectory and performs cross-modal alignment sequentially. In this paper, we +propose a novel Prompt-bAsed coNtext- and Domain-Aware (PANDA) pretraining +framework to address these problems. It performs prompting in two stages. In +the domain-aware stage, we apply a low-cost prompt tuning paradigm to learn +soft visual prompts from an in-domain dataset for equipping the pretrained +models with object-level and scene-level cross-modal alignment in VLN tasks. +Furthermore, in the context-aware stage, we design a set of hard context +prompts to capture the sequence-level semantics and instill both out-of-context +and contextual knowledge in the instruction into cross-modal representations. +They enable further tuning of the pretrained models via contrastive learning. +Experimental results on both R2R and REVERIE show the superiority of PANDA +compared to previous state-of-the-art methods. + +
+
+ comment: The paper contains errors, and we wish to withdraw it.
+
+
+
+
+ + ♻ ☆ SACReg: Scene-Agnostic Coordinate Regression for Visual Localization + + +
+ Scene coordinate regression (SCR), i.e., predicting 3D coordinates for every
+pixel of a given image, has recently shown promising potential. However,
+existing methods remain limited to small scenes memorized during training, and
+thus hardly scale to realistic datasets and scenarios. In this paper, we
+propose a generalized SCR model trained once to be deployed in new test scenes,
+regardless of their scale, without any finetuning. Instead of encoding the
+scene coordinates into the network weights, our model takes as input a database
+image with some sparse 2D pixel to 3D coordinate annotations, extracted from
+e.g. off-the-shelf Structure-from-Motion or RGB-D data, and a query image, for
+which a dense 3D coordinate map and its confidence are predicted based on
+cross-attention. At test time, we rely on existing off-the-shelf image
+retrieval systems and fuse the predictions from a shortlist of relevant
+database images w.r.t. the query. Afterwards, the camera pose is obtained using
+standard Perspective-n-Point (PnP). Starting from self-supervised CroCo
+pretrained weights, we train our model on diverse datasets to ensure
+generalizability across various scenarios, and significantly outperform other
+scene regression approaches, including scene-specific models, on multiple
+visual localization benchmarks. Finally, we show that the database
+representation of images and their 2D-3D annotations can be highly compressed
+with negligible loss of localization performance.
+
+
+
+
+ + ♻ ☆ Point-DAE: Denoising Autoencoders for Self-supervised Point Cloud + Learning + + +
+ Masked autoencoder has demonstrated its effectiveness in self-supervised +point cloud learning. Considering that masking is a kind of corruption, in this +work we explore a more general denoising autoencoder for point cloud learning +(Point-DAE) by investigating more types of corruptions beyond masking. +Specifically, we degrade the point cloud with certain corruptions as input, and +learn an encoder-decoder model to reconstruct the original point cloud from its +corrupted version. Three corruption families (\ie, density/masking, noise, and +affine transformation) and a total of fourteen corruption types are +investigated with traditional non-Transformer encoders. Besides the popular +masking corruption, we identify another effective corruption family, \ie, +affine transformation. The affine transformation disturbs all points globally, +which is complementary to the masking corruption where some local regions are +dropped. We also validate the effectiveness of affine transformation corruption +with the Transformer backbones, where we decompose the reconstruction of the +complete point cloud into the reconstructions of detailed local patches and +rough global shape, alleviating the position leakage problem in the +reconstruction. Extensive experiments on tasks of object classification, +few-shot learning, robustness testing, part segmentation, and 3D object +detection validate the effectiveness of the proposed method. The codes are +available at \url{https://github.com/YBZh/Point-DAE}. + +
+
+ comment: The codes are available at \url{https://github.com/YBZh/Point-DAE} +
+
+
+
+
+ + ♻ ☆ Unsupervised Discovery of Interpretable Directions in h-space of + Pre-trained Diffusion Models + + +
+ We propose the first unsupervised and learning-based method to identify +interpretable directions in h-space of pre-trained diffusion models. Our method +is derived from an existing technique that operates on the GAN latent space. +Specifically, we employ a shift control module that works on h-space of +pre-trained diffusion models to manipulate a sample into a shifted version of +itself, followed by a reconstructor to reproduce both the type and the strength +of the manipulation. By jointly optimizing them, the model will spontaneously +discover disentangled and interpretable directions. To prevent the discovery of +meaningless and destructive directions, we employ a discriminator to maintain +the fidelity of shifted sample. Due to the iterative generative process of +diffusion models, our training requires a substantial amount of GPU VRAM to +store numerous intermediate tensors for back-propagating gradient. To address +this issue, we propose a general VRAM-efficient training algorithm based on +gradient checkpointing technique to back-propagate any gradient through the +whole generative process, with acceptable occupancy of VRAM and sacrifice of +training efficiency. Compared with existing related works on diffusion models, +our method inherently identifies global and scalable directions, without +necessitating any other complicated procedures. Extensive experiments on +various datasets demonstrate the effectiveness of our method. + +
+
+
+
+
+ + ♻ ☆ Dual-stream contrastive predictive network with joint handcrafted + feature view for SAR ship classification ICASSP2024 + + +
+ Most existing synthetic aperture radar (SAR) ship classification technologies
+heavily rely on correctly labeled data, ignoring the discriminative features of
+unlabeled SAR ship images. Even though researchers try to enrich CNN-based
+features by introducing traditional handcrafted features, existing methods
+easily cause information redundancy and fail to capture the interaction between
+them. To address these issues, we propose a novel dual-stream contrastive
+predictive network (DCPNet), which consists of two asymmetric task designs and
+a false negative sample elimination module. The first task is to construct
+positive sample pairs, guiding the core encoder to learn more general
+representations. The second task is to encourage adaptive capture of the
+correspondence between deep features and handcrafted features, achieving
+knowledge transfer within the model and effectively reducing the redundancy
+caused by feature fusion. To increase the separability between clusters, we
+also design a cluster-level task. Experimental results on the OpenSARShip and
+FUSAR-Ship datasets demonstrate that DCPNet improves the classification
+accuracy of supervised models and confirm its capability to learn effective
+representations.
+
+ comment: 6 pages, 3 figures, ICASSP2024 +
+
+
+
+
+ + ♻ ☆ Language Models as Black-Box Optimizers for Vision-Language Models + + +
+ Vision-language models (VLMs) pre-trained on web-scale datasets have +demonstrated remarkable capabilities on downstream tasks when fine-tuned with +minimal data. However, many VLMs rely on proprietary data and are not +open-source, which restricts the use of white-box approaches for fine-tuning. +As such, we aim to develop a black-box approach to optimize VLMs through +natural language prompts, thereby avoiding the need to access model parameters, +feature embeddings, or even output logits. We propose employing chat-based LLMs +to search for the best text prompt for VLMs. Specifically, we adopt an +automatic hill-climbing procedure that converges to an effective prompt by +evaluating the performance of current prompts and asking LLMs to refine them +based on textual feedback, all within a conversational process without +human-in-the-loop. In a challenging 1-shot image classification setup, our +simple approach surpasses the white-box continuous prompting method (CoOp) by +an average of 1.5% across 11 datasets including ImageNet. Our approach also +outperforms both human-engineered and LLM-generated prompts. We highlight the +advantage of conversational feedback that incorporates both positive and +negative prompts, suggesting that LLMs can utilize the implicit gradient +direction in textual feedback for a more efficient search. In addition, we find +that the text prompts generated through our strategy are not only more +interpretable but also transfer well across different VLM architectures in a +black-box manner. Lastly, we demonstrate our framework on a state-of-the-art +black-box VLM (DALL-E 3) for text-to-image optimization. + +
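The hill-climbing procedure can be outlined schematically as below; `score_prompt` and `llm_refine` are hypothetical stand-ins for a real VLM evaluation and a real chat-LLM call, and the loop structure is an assumption rather than the authors' exact procedure.

```python
# Schematic black-box prompt hill-climbing (no real VLM/LLM calls).
import random

def score_prompt(prompt):                 # stand-in for 1-shot accuracy of a black-box VLM
    random.seed(hash(prompt) % 10_000)
    return random.random()

def llm_refine(best, worst):              # stand-in for a chat LLM proposing refined prompts
    return [best + suffix for suffix in (", a photo", ", studio lighting", ", close-up")]

candidates = ["a photo of a {class}", "an image of a {class}"]
for _ in range(5):
    scored = sorted(candidates, key=score_prompt, reverse=True)
    best, worst = scored[0], scored[-1]
    candidates = [best] + llm_refine(best, worst)   # keep the best prompt, explore around it
print(best, score_prompt(best))
```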
+
+ comment: Project site: llm-can-optimize-vlm.github.io +
+
+
+
+
+ + ♻ ☆ Image retrieval outperforms diffusion models on data augmentation + + +
+ Many approaches have been proposed to use diffusion models to augment +training datasets for downstream tasks, such as classification. However, +diffusion models are themselves trained on large datasets, often with noisy +annotations, and it remains an open question to which extent these models +contribute to downstream classification performance. In particular, it remains +unclear if they generalize enough to improve over directly using the additional +data of their pre-training process for augmentation. We systematically evaluate +a range of existing methods to generate images from diffusion models and study +new extensions to assess their benefit for data augmentation. Personalizing +diffusion models towards the target data outperforms simpler prompting +strategies. However, using the pre-training data of the diffusion model alone, +via a simple nearest-neighbor retrieval procedure, leads to even stronger +downstream performance. Our study explores the potential of diffusion models in +generating new training data, and surprisingly finds that these sophisticated +models are not yet able to beat a simple and strong image retrieval baseline on +simple downstream vision tasks. + +
+
+
+
+
+ + ♻ ☆ Local Low-light Image Enhancement via Region-Aware Normalization + + +
+ In the realm of Low-Light Image Enhancement (LLIE), existing research
+primarily focuses on enhancing images globally. However, many applications
+require local LLIE, where users are allowed to illuminate specific regions
+using an input mask, such as creating a protagonist stage or spotlight effect.
+This task has so far received limited attention. This paper aims to
+systematically define the requirements of local LLIE and proposes a novel
+strategy to convert existing global LLIE methods into local versions. The image
+space is divided into three regions: Masked Area A, which is enlightened to
+achieve the desired lighting effects; Transition Area B, which provides a
+smooth transition from the enlightened area (Area A) to the unchanged region
+(Area C). To achieve the task of local LLIE, we introduce Region-Aware
+Normalization for Local Enhancement, dubbed RANLEN. RANLEN uses a dynamically
+designed mask-based normalization operation, which enhances an image in a
+spatially varying manner, ensuring that the enhancement results are consistent
+with the requirements specified by the input mask. Additionally, a set of
+region-aware loss terms is formulated to facilitate the learning of the local
+LLIE framework. Our strategy can be applied to existing global LLIE networks
+with varying structures. Extensive experiments demonstrate that our approach
+can produce the desired lighting effects compared to global LLIE, all the while
+offering controllable local enhancement with various mask shapes.
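To illustrate the three-region formulation (not RANLEN's learned normalization), the toy snippet below blends a globally enhanced image with the original using a mask-derived weight: fully enhanced in Area A, smoothly ramped across Area B, and unchanged in Area C. The stand-in enhancer and band width are assumptions.

```python
# Toy mask-driven local enhancement with a smooth transition band.
import numpy as np
from scipy.ndimage import distance_transform_edt

img = np.random.rand(64, 64)
mask = np.zeros((64, 64)); mask[20:44, 20:44] = 1.0        # Area A

enhanced = np.clip(img * 2.0, 0, 1)                        # stand-in "global LLIE" result
dist = distance_transform_edt(mask == 0)                   # distance to Area A for outside pixels
band = 8.0
weight = np.clip(1.0 - dist / band, 0, 1)                  # 1 in A, ramps to 0 across B
out = weight * enhanced + (1 - weight) * img               # Area C stays untouched
print(out.shape, out[0, 0] == img[0, 0])
```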
+
+
+
+
+ + ♻ ☆ CADTalk: An Algorithm and Benchmark for Semantic Commenting of CAD + Programs + + +
+ CAD programs are a popular way to compactly encode shapes as a sequence of +operations that are easy to parametrically modify. However, without sufficient +semantic comments and structure, such programs can be challenging to +understand, let alone modify. We introduce the problem of semantic commenting +CAD programs, wherein the goal is to segment the input program into code blocks +corresponding to semantically meaningful shape parts and assign a semantic +label to each block. We solve the problem by combining program parsing with +visual-semantic analysis afforded by recent advances in foundational language +and vision models. Specifically, by executing the input programs, we create +shapes, which we use to generate conditional photorealistic images to make use +of semantic annotators for such images. We then distill the information across +the images and link back to the original programs to semantically comment on +them. Additionally, we collected and annotated a benchmark dataset, CADTalk, +consisting of 5,280 machine-made programs and 45 human-made programs with +ground truth semantic comments to foster future research. We extensively +evaluated our approach, compared to a GPT-based baseline approach, and an +open-set shape segmentation baseline, i.e., PartSLIP, and reported an 83.24% +accuracy on the new CADTalk dataset. Project page: +https://enigma-li.github.io/CADTalk/. + +
+
+
+
+
+ + ♻ ☆ A Continual Learning Paradigm for Non-differentiable Visual Programming + Frameworks on Visual Reasoning Tasks + + +
+ Recently, the visual programming framework (VisProg) has emerged as a
+significant framework for executing compositional visual tasks due to its
+interpretability and flexibility. However, the performance of VisProg on
+specific Visual Reasoning (VR) tasks is markedly inferior to that of
+well-trained task-specific models, since its visual sub-modules have limited
+generalization capabilities. Due to the non-differentiability of VisProg, it is
+quite challenging to improve these visual sub-modules within VisProg for a
+specific VR task while maintaining their generalizability on unseen tasks. To
+overcome these difficulties, we propose CLVP, a Continual Learning paradigm for
+VisProg across various visual reasoning tasks. Specifically, our CLVP distills
+the capabilities of well-trained task-specific models into the visual
+sub-modules in a stepwise and anti-forgetting manner. This can continually
+improve the performance of VisProg on multiple visual tasks while preserving
+the flexibility of VisProg. Extensive and comprehensive experimental results
+demonstrate that our CLVP obtains significant performance gains on specific VR
+benchmarks, i.e., GQA (+1.4%) and NLVRv2 (+5.6%), compared to the VisProg
+baseline, and also maintains promising generalizability for VR on unseen and
+previously learned tasks.
+
+
+
+
+ + ♻ ☆ Ranni: Taming Text-to-Image Diffusion for Accurate Instruction Following + + +
+ Existing text-to-image (T2I) diffusion models usually struggle in +interpreting complex prompts, especially those with quantity, object-attribute +binding, and multi-subject descriptions. In this work, we introduce a semantic +panel as the middleware in decoding texts to images, supporting the generator +to better follow instructions. The panel is obtained through arranging the +visual concepts parsed from the input text by the aid of large language models, +and then injected into the denoising network as a detailed control signal to +complement the text condition. To facilitate text-to-panel learning, we come up +with a carefully designed semantic formatting protocol, accompanied by a +fully-automatic data preparation pipeline. Thanks to such a design, our +approach, which we call Ranni, manages to enhance a pre-trained T2I generator +regarding its textual controllability. More importantly, the introduction of +the generative middleware brings a more convenient form of interaction (i.e., +directly adjusting the elements in the panel or using language instructions) +and further allows users to finely customize their generation, based on which +we develop a practical system and showcase its potential in continuous +generation and chatting-based editing. Our project page is at +https://ranni-t2i.github.io/Ranni. + +
+
+
+
+
+ + ♻ ☆ Editing Large Language Models: Problems, Methods, and Opportunities EMNLP 2023 + + +
+ Despite the ability to train capable LLMs, the methodology for maintaining +their relevancy and rectifying errors remains elusive. To this end, the past +few years have witnessed a surge in techniques for editing LLMs, the objective +of which is to efficiently alter the behavior of LLMs within a specific domain +without negatively impacting performance across other inputs. This paper +embarks on a deep exploration of the problems, methods, and opportunities +related to model editing for LLMs. In particular, we provide an exhaustive +overview of the task definition and challenges associated with model editing, +along with an in-depth empirical analysis of the most progressive methods +currently at our disposal. We also build a new benchmark dataset to facilitate +a more robust evaluation and pinpoint enduring issues intrinsic to existing +techniques. Our objective is to provide valuable insights into the +effectiveness and feasibility of each editing technique, thereby assisting the +community in making informed decisions on the selection of the most appropriate +method for a specific task or context. Code and datasets are available at +https://github.com/zjunlp/EasyEdit. + +
+
+ comment: EMNLP 2023. Updated with new experiments +
+
+
+
+
+ + ♻ ☆ ShapeGPT: 3D Shape Generation with A Unified Multi-modal Language Model + + +
+ The advent of large language models, enabling flexibility through +instruction-driven approaches, has revolutionized many traditional generative +tasks, but large models for 3D data, particularly in comprehensively handling +3D shapes with other modalities, are still under-explored. By achieving +instruction-based shape generations, versatile multimodal generative shape +models can significantly benefit various fields like 3D virtual construction +and network-aided design. In this work, we present ShapeGPT, a shape-included +multi-modal framework to leverage strong pre-trained language models to address +multiple shape-relevant tasks. Specifically, ShapeGPT employs a +word-sentence-paragraph framework to discretize continuous shapes into shape +words, further assembles these words for shape sentences, as well as integrates +shape with instructional text for multi-modal paragraphs. To learn this +shape-language model, we use a three-stage training scheme, including shape +representation, multimodal alignment, and instruction-based generation, to +align shape-language codebooks and learn the intricate correlations among these +modalities. Extensive experiments demonstrate that ShapeGPT achieves comparable +performance across shape-relevant tasks, including text-to-shape, +shape-to-text, shape completion, and shape editing. + +
+
+
+
+
+ + ♻ ☆ DocPedia: Unleashing the Power of Large Multimodal Model in the + Frequency Domain for Versatile Document Understanding + + +
+ This work presents DocPedia, a novel large multimodal model (LMM) for
+versatile OCR-free document understanding, capable of parsing images up to
+2,560$\times$2,560 resolution. Unlike existing work, which either struggles
+with high-resolution documents or gives up the large language model and thus
+constrains vision or language ability, our DocPedia directly processes visual
+input in the frequency domain rather than the pixel space. This unique
+characteristic enables DocPedia to capture a greater amount of visual and
+textual information using a limited number of visual tokens. To consistently
+enhance both the perception and comprehension abilities of our model, we
+develop a dual-stage training strategy and enrich instructions/annotations of
+all training tasks covering multiple document types. Extensive quantitative and
+qualitative experiments conducted on various publicly available benchmarks
+confirm the mutual benefits of jointly learning perception and comprehension
+tasks. The results provide further evidence of the effectiveness and superior
+performance of our DocPedia over other methods.
+
+
+
+
+ + ♻ ☆ LLaFS: When Large-Language Models Meet Few-Shot Segmentation + + +
+ This paper proposes LLaFS, the first attempt to leverage large language +models (LLMs) in few-shot segmentation. In contrast to the conventional +few-shot segmentation methods that only rely on the limited and biased +information from the annotated support images, LLaFS leverages the vast prior +knowledge gained by LLM as an effective supplement and directly uses the LLM to +segment images in a few-shot manner. To enable the text-based LLM to handle +image-related tasks, we carefully design an input instruction that allows the +LLM to produce segmentation results represented as polygons, and propose a +region-attribute table to simulate the human visual mechanism and provide +multi-modal guidance. We also synthesize pseudo samples and use curriculum +learning for pretraining to augment data and achieve better optimization. LLaFS +achieves state-of-the-art results on multiple datasets, showing the potential +of using LLMs for few-shot computer vision tasks. Code will be available at +https://github.com/lanyunzhu99/LLaFS. + +
+
+
+
+
+ + ♻ ☆ W-HMR: Human Mesh Recovery in World Space with Weak-supervised Camera + Calibration and Orientation Correction + + +
+ For a long time, in the field of reconstructing 3D human bodies from +monocular images, most methods opted to simplify the task by minimizing the +influence of the camera. Using a coarse focal length setting results in the +reconstructed bodies not aligning well with distorted images. Ignoring camera +rotation leads to an unrealistic reconstructed body pose in world space. +Consequently, existing methods' application scenarios are confined to +controlled environments. And they struggle to achieve accurate and reasonable +reconstruction in world space when confronted with complex and diverse +in-the-wild images. To address the above issues, we propose W-HMR, which +decouples global body recovery into camera calibration, local body recovery and +global body orientation correction. We design the first weak-supervised camera +calibration method for body distortion, eliminating dependence on focal length +labels and achieving finer mesh-image alignment. We propose a novel orientation +correction module to allow the reconstructed human body to remain normal in +world space. Decoupling body orientation and body pose enables our model to +consider the accuracy in camera coordinate and the reasonableness in world +coordinate simultaneously, expanding the range of applications. As a result, +W-HMR achieves high-quality reconstruction in dual coordinate systems, +particularly in challenging scenes. Codes will be released on +https://yw0208.github.io/w-hmr/ after publication. + +
+
+ comment: Project Page: https://yw0208.github.io/w-hmr/ +
+
+
+
+
+ + ♻ ☆ Beyond the Field-of-View: Enhancing Scene Visibility and Perception with + Clip-Recurrent Transformer + + +
+ Vision sensors are widely applied in vehicles, robots, and roadside
+infrastructure. However, due to limitations in hardware cost and system size,
+the camera Field-of-View (FoV) is often restricted and may not provide
+sufficient coverage. Nevertheless, from a spatiotemporal perspective, it is
+possible to obtain information beyond the camera's physical FoV from past video
+streams. In this paper, we propose the concept of online video inpainting for
+autonomous vehicles to expand the field of view, thereby enhancing scene
+visibility, perception, and system safety. To achieve this, we introduce the
+FlowLens architecture, which explicitly employs optical flow and implicitly
+incorporates a novel clip-recurrent transformer for feature propagation.
+FlowLens offers two key features: 1) FlowLens includes a newly designed
+Clip-Recurrent Hub with 3D-Decoupled Cross Attention (DDCA) to progressively
+process global information accumulated over time. 2) It integrates a
+multi-branch Mix Fusion Feed Forward Network (MixF3N) to enhance the precise
+spatial flow of local features. To facilitate training and evaluation, we
+derive the KITTI360 dataset with various FoV masks, which covers both outer-
+and inner-FoV expansion scenarios. We also conduct quantitative assessments of
+beyond-FoV semantics across different models and perform qualitative
+comparisons of beyond-FoV object detection. We illustrate that employing
+FlowLens to reconstruct unseen scenes even enhances perception within the field
+of view by providing reliable semantic context. Extensive experiments and user
+studies involving offline and online video inpainting, as well as beyond-FoV
+perception tasks, demonstrate that FlowLens achieves state-of-the-art
+performance. The source code and dataset are made publicly available at
+https://github.com/MasterHow/FlowLens.
+
+ comment: The source code and dataset are made publicly available at + https://github.com/MasterHow/FlowLens +
+
+
+
+
+ + ♻ ☆ Table Detection in the Wild: A Novel Diverse Table Detection Dataset and + Method + + +
+ Recent deep learning approaches in table detection have achieved outstanding
+performance and proved to be effective in identifying document layouts.
+Currently available table detection benchmarks have many limitations, including
+a lack of sample diversity, simple table structures, few training cases, and
+low sample quality. In this paper, we introduce a diverse large-scale dataset
+for table detection with more than seven thousand samples containing a wide
+variety of table structures collected from many diverse sources. In addition,
+we present baseline results using a convolutional neural network-based method
+to detect table structure in documents. Experimental results show the
+superiority of applying convolutional deep learning methods over classical
+computer vision-based methods. The introduction of this diverse table detection
+dataset will enable the community to develop high-throughput deep learning
+methods for understanding document layout and tabular data processing. The
+dataset is available at: 1.
+https://www.kaggle.com/datasets/mrinalim/stdw-dataset 2.
+https://huggingface.co/datasets/n3011/STDW
+
+ comment: Open source Table detection dataset and baseline results +
+
+
+
+
+ + ♻ ☆ Rethinking Domain Generalization: Discriminability and Generalizability + + +
+ Domain generalization (DG) endeavors to develop robust models that possess +strong generalizability while preserving excellent discriminability. +Nonetheless, pivotal DG techniques tend to improve the feature generalizability +by learning domain-invariant representations, inadvertently overlooking the +feature discriminability. On the one hand, the simultaneous attainment of +generalizability and discriminability of features presents a complex challenge, +often entailing inherent contradictions. This challenge becomes particularly +pronounced when domain-invariant features manifest reduced discriminability +owing to the inclusion of unstable factors, \emph{i.e.,} spurious correlations. +On the other hand, prevailing domain-invariant methods can be categorized as +category-level alignment, susceptible to discarding indispensable features +possessing substantial generalizability and narrowing intra-class variations. +To surmount these obstacles, we rethink DG from a new perspective that +concurrently imbues features with formidable discriminability and robust +generalizability, and present a novel framework, namely, Discriminative +Microscopic Distribution Alignment (DMDA). DMDA incorporates two core +components: Selective Channel Pruning~(SCP) and Micro-level Distribution +Alignment (MDA). Concretely, SCP attempts to curtail redundancy within neural +networks, prioritizing stable attributes conducive to accurate classification. +This approach alleviates the adverse effect of spurious domain invariance and +amplifies the feature discriminability. Besides, MDA accentuates micro-level +alignment within each class, going beyond mere category-level alignment. This +strategy accommodates sufficient generalizable features and facilitates +within-class variations. Extensive experiments on four benchmark datasets +corroborate the efficacy of our method. + +
+
+
+
+
+ + ♻ ☆ Motion-DVAE: Unsupervised learning for fast human motion denoising + + +
+ Pose and motion priors are crucial for recovering realistic and accurate +human motion from noisy observations. Substantial progress has been made on +pose and shape estimation from images, and recent works showed impressive +results using priors to refine frame-wise predictions. However, a lot of motion +priors only model transitions between consecutive poses and are used in +time-consuming optimization procedures, which is problematic for many +applications requiring real-time motion capture. We introduce Motion-DVAE, a +motion prior to capture the short-term dependencies of human motion. As part of +the dynamical variational autoencoder (DVAE) models family, Motion-DVAE +combines the generative capability of VAE models and the temporal modeling of +recurrent architectures. Together with Motion-DVAE, we introduce an +unsupervised learned denoising method unifying regression- and +optimization-based approaches in a single framework for real-time 3D human pose +estimation. Experiments show that the proposed approach reaches competitive +performance with state-of-the-art methods while being much faster. + +
+
+
+
+
+ + ♻ ☆ DRM-IR: Task-Adaptive Deep Unfolding Network for All-In-One Image + Restoration + + +
+ Existing All-In-One image restoration (IR) methods usually lack flexible modeling
+ of various types of degradation, thus impeding the restoration performance. To
+ achieve All-In-One IR with higher task dexterity, this work proposes an efficient
+ Dynamic Reference Modeling paradigm (DRM-IR), which consists of task-adaptive
+ degradation modeling and model-based image restoring. Specifically, these two
+ subtasks are formalized as a pair of entangled reference-based maximum a posteriori
+ (MAP) inferences, which are optimized synchronously in an unfolding-based manner.
+ With the two cascaded subtasks, DRM-IR first dynamically models the task-specific
+ degradation based on a reference image pair and further restores the image with the
+ collected degradation statistics. Besides, to bridge the semantic gap between the
+ reference and target degraded images, we further devise a Degradation Prior
+ Transmitter (DPT) that restrains the instance-specific feature differences. DRM-IR
+ explicitly provides superior flexibility for All-In-One IR while being
+ interpretable. Extensive experiments on multiple benchmark datasets show that our
+ DRM-IR achieves state-of-the-art performance in All-In-One IR.
+
+
+
+
+
+ + ♻ ☆ Med-Tuning: Parameter-Efficient Transfer Learning with Fine-Grained + Feature Enhancement for Medical Volumetric Segmentation + + +
+ Deep learning-based medical volumetric segmentation methods either train the model
+ from scratch or follow the standard "pre-training then fine-tuning" paradigm.
+ Although fine-tuning a pre-trained model on downstream tasks can harness its
+ representation power, the standard full fine-tuning is costly in terms of
+ computation and memory footprint. In this paper, we present a study of
+ parameter-efficient transfer learning for medical volumetric segmentation and
+ propose a new framework named Med-Tuning based on intra-stage feature enhancement
+ and inter-stage feature interaction. Additionally, aiming at exploiting the
+ intrinsic global properties of the Fourier Transform for parameter-efficient
+ transfer learning, a new adapter block, namely Med-Adapter, with a well-designed
+ Fourier Transform branch is proposed to effectively and efficiently model the
+ crucial global context for medical volumetric segmentation. Given a large-scale
+ model pre-trained on 2D natural images, our method can exploit both the crucial
+ spatial multi-scale features and the volumetric correlations along slices for
+ accurate segmentation. Extensive experiments on three benchmark datasets (including
+ CT and MRI) show that our method achieves better results than previous
+ parameter-efficient transfer learning methods on segmentation tasks, with much
+ lower tuned parameter costs. Compared to full fine-tuning, our method reduces the
+ fine-tuned model parameters by up to 4x, with even better segmentation performance.
+ The code will be made publicly available at
+ https://github.com/jessie-chen99/Med-Tuning.
+
+
+
+
+
+
+
+
+ + Information Retrieval 17 + +
+
+
+ + ☆ Routing-Guided Learned Product Quantization for Graph-Based Approximate + Nearest Neighbor Search + + +
+ Given a vector dataset $\mathcal{X}$ and a query vector $\vec{x}_q$, graph-based
+ Approximate Nearest Neighbor Search (ANNS) aims to build a proximity graph (PG) as
+ an index of $\mathcal{X}$ and approximately return vectors with minimum distances
+ to $\vec{x}_q$ by searching over the PG index. It struggles with a large-scale
+ $\mathcal{X}$ because a PG with full vectors is too large to fit into memory, e.g.,
+ a billion-scale $\mathcal{X}$ in 128 dimensions would consume nearly 600 GB of
+ memory. To solve this, Product Quantization (PQ)-integrated graph-based ANNS has
+ been proposed to reduce memory usage by keeping compact codes of quantized vectors
+ in memory instead of the large original vectors. Existing PQ methods do not
+ consider the important routing features of the PG, resulting in low-quality
+ quantized vectors that affect the ANNS's effectiveness. In this paper, we present
+ an end-to-end Routing-guided learned Product Quantization (RPQ) for graph-based
+ ANNS. It consists of (1) a \textit{differentiable quantizer} used to make the
+ standard discrete PQ differentiable to suit the back-propagation of end-to-end
+ learning, (2) a \textit{sampling-based feature extractor} used to extract
+ neighborhood and routing features of a PG, and (3) a \textit{multi-feature joint
+ training module} with two types of feature-aware losses to continuously optimize
+ the differentiable quantizer. As a result, the inherent features of a PG are
+ embedded into the learned PQ, generating high-quality quantized vectors. Moreover,
+ we integrate our RPQ with the state-of-the-art DiskANN and existing popular PGs to
+ improve their performance. Comprehensive experiments on real-world large-scale
+ datasets (from 1M to 1B) demonstrate RPQ's superiority, e.g., a
+ 1.7$\times$-4.2$\times$ improvement in QPS at the same recall@10 of 95\%.
+
+
+ comment: 14 pages, 12 figures +
+
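+ For readers unfamiliar with the plain (non-learned) Product Quantization that RPQ
+ builds on, a minimal NumPy/scikit-learn sketch follows: it learns per-subspace
+ k-means codebooks and encodes vectors as short codeword-id tuples. This illustrates
+ standard PQ only; the routing-guided, differentiable quantizer proposed above is
+ not reproduced, and the subspace/codebook sizes are arbitrary.
+
+ # Plain Product Quantization (PQ) sketch -- not the learned RPQ of the paper.
+ import numpy as np
+ from sklearn.cluster import KMeans
+
+ def train_pq(X, M=8, K=256):
+     """Split the D dims into M subspaces and learn a K-word codebook per subspace."""
+     N, D = X.shape
+     d = D // M
+     codebooks = []
+     for m in range(M):
+         km = KMeans(n_clusters=K, n_init=4, random_state=0).fit(X[:, m * d:(m + 1) * d])
+         codebooks.append(km.cluster_centers_)
+     return codebooks
+
+ def encode_pq(X, codebooks):
+     """Encode each vector as M codeword ids (one per subspace)."""
+     M = len(codebooks)
+     d = codebooks[0].shape[1]
+     codes = np.empty((X.shape[0], M), dtype=np.int32)
+     for m, C in enumerate(codebooks):
+         sub = X[:, m * d:(m + 1) * d]
+         dists = ((sub[:, None, :] - C[None, :, :]) ** 2).sum(-1)
+         codes[:, m] = dists.argmin(1)
+     return codes
+
+ X = np.random.randn(10_000, 128).astype(np.float32)
+ books = train_pq(X, M=8, K=16)     # small K to keep the demo fast
+ codes = encode_pq(X[:5], books)    # 5 vectors -> 5 x 8 codeword ids
+ print(codes)
+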
+
+
+
+ + ☆ Barwise Music Structure Analysis with the Correlation Block-Matching + Segmentation Algorithm + + +
+ Music Structure Analysis (MSA) is a Music Information Retrieval task +consisting of representing a song in a simplified, organized manner by breaking +it down into sections typically corresponding to ``chorus'', ``verse'', +``solo'', etc. In this work, we extend an MSA algorithm called the Correlation +Block-Matching (CBM) algorithm introduced by (Marmoret et al., 2020, 2022b). +The CBM algorithm is a dynamic programming algorithm that segments +self-similarity matrices, which are a standard description used in MSA and in +numerous other applications. In this work, self-similarity matrices are +computed from the feature representation of an audio signal and time is sampled +at the bar-scale. This study examines three different standard similarity +functions for the computation of self-similarity matrices. Results show that, +in optimal conditions, the proposed algorithm achieves a level of performance +which is competitive with supervised state-of-the-art methods while only +requiring knowledge of bar positions. In addition, the algorithm is made +open-source and is highly customizable. + +
+
+ comment: 19 pages, 13 figures, 11 tables, 1 algorithm, published in + Transactions of the International Society for Music Information Retrieval +
+
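+ As background, the self-similarity matrices segmented by the CBM algorithm can be
+ illustrated with a plain cosine-similarity computation over bar-level feature
+ vectors; the feature dimensionality below is an arbitrary assumption, and this is
+ not the authors' exact feature pipeline.
+
+ # Cosine self-similarity matrix from bar-level features (generic sketch).
+ import numpy as np
+
+ def self_similarity(bar_features, eps=1e-9):
+     """bar_features: (n_bars, n_dims) array; returns an (n_bars, n_bars) matrix."""
+     F = np.asarray(bar_features, dtype=float)
+     F = F / (np.linalg.norm(F, axis=1, keepdims=True) + eps)
+     return F @ F.T
+
+ bars = np.random.rand(64, 12)      # e.g. 64 bars of chroma-like features (assumed shape)
+ S = self_similarity(bars)
+ print(S.shape, S.diagonal()[:3])   # (64, 64), diagonal close to 1
+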
+
+
+
+ + ☆ Search Still Matters: Information Retrieval in the Era of Generative AI + + +
+ Objective: Information retrieval (IR, also known as search) systems are +ubiquitous in modern times. How does the emergence of generative artificial +intelligence (AI), based on large language models (LLMs), fit into the IR +process? Process: This perspective explores the use of generative AI in the +context of the motivations, considerations, and outcomes of the IR process with +a focus on the academic use of such systems. Conclusions: There are many +information needs, from simple to complex, that motivate use of IR. Users of +such systems, particularly academics, have concerns for authoritativeness, +timeliness, and contextualization of search. While LLMs may provide +functionality that aids the IR process, the continued need for search systems, +and research into their improvement, remains essential. + +
+
+ comment: 6 pages, no figures +
+
+
+
+
+ + ☆ End-to-End Retrieval with Learned Dense and Sparse Representations Using + Lucene + + +
+ The bi-encoder architecture provides a framework for understanding +machine-learned retrieval models based on dense and sparse vector +representations. Although these representations capture parametric realizations +of the same underlying conceptual framework, their respective implementations +of top-$k$ similarity search require the coordination of different software +components (e.g., inverted indexes, HNSW indexes, and toolkits for neural +inference), often knitted together in complex architectures. In this work, we +ask the following question: What's the simplest design, in terms of requiring +the fewest changes to existing infrastructure, that can support end-to-end +retrieval with modern dense and sparse representations? The answer appears to +be that Lucene is sufficient, as we demonstrate in Anserini, a toolkit for +reproducible information retrieval research. That is, effective retrieval with +modern single-vector neural models can be efficiently performed directly in +Java on the CPU. We examine the implications of this design for information +retrieval researchers pushing the state of the art as well as for software +engineers building production search systems. + +
+
+
+
+
+ + ☆ Poisoning Attacks Against Contrastive Recommender Systems + + +
+ Contrastive learning (CL) has recently gained significant popularity in the +field of recommendation. Its ability to learn without heavy reliance on labeled +data is a natural antidote to the data sparsity issue. Previous research has +found that CL can not only enhance recommendation accuracy but also +inadvertently exhibit remarkable robustness against noise. However, this paper +identifies a vulnerability of CL-based recommender systems: Compared with their +non-CL counterparts, they are even more susceptible to poisoning attacks that +aim to promote target items. Our analysis points to the uniform dispersion of +representations led by the CL loss as the very factor that accounts for this +vulnerability. We further theoretically and empirically demonstrate that the +optimization of CL loss can lead to smooth spectral values of representations. +Based on these insights, we attempt to reveal the potential poisoning attacks +against CL-based recommender systems. The proposed attack encompasses a +dual-objective framework: One that induces a smoother spectral value +distribution to amplify the CL loss's inherent dispersion effect, named +dispersion promotion; and the other that directly elevates the visibility of +target items, named rank promotion. We validate the destructiveness of our +attack model through extensive experimentation on four datasets. By shedding +light on these vulnerabilities, we aim to facilitate the development of more +robust CL-based recommender systems. + +
+
+ comment: 14pages,6 figures,5 tables +
+
+
+
+
+ + ☆ Beyond Two-Tower Matching: Learning Sparse Retrievable + Cross-Interactions for Recommendation SIGIR 2023 + + +
+ Two-tower models are a prevalent matching framework for recommendation, which have
+ been widely deployed in industrial applications. The success of two-tower matching
+ is attributed to its efficiency in retrieval among a large number of items, since
+ the item tower can be precomputed and used for fast Approximate Nearest Neighbor
+ (ANN) search. However, it suffers from two main challenges, including limited
+ feature interaction capability and reduced accuracy in online serving. Existing
+ approaches attempt to design novel late interactions instead of dot products, but
+ they still fail to support complex feature interactions or lose retrieval
+ efficiency. To address these challenges, we propose a new matching paradigm named
+ SparCode, which supports not only sophisticated feature interactions but also
+ efficient retrieval. Specifically, SparCode introduces an all-to-all interaction
+ module to model fine-grained query-item interactions. Besides, we design a discrete
+ code-based sparse inverted index jointly trained with the model to achieve
+ effective and efficient model inference. Extensive experiments have been conducted
+ on open benchmark datasets to demonstrate the superiority of our framework. The
+ results show that SparCode significantly improves the accuracy of candidate item
+ matching while retaining the same level of retrieval efficiency as two-tower
+ models. Our source code will be available at MindSpore/models.
+
+
+ comment: Accepted by SIGIR 2023. Code will be available at + https://reczoo.github.io/SparCode +
+
+
+
+
+ + ☆ COVID-19 Vaccine Misinformation in Middle Income Countries EMNLP 2023 + + +
+ This paper introduces a multilingual dataset of COVID-19 vaccine +misinformation, consisting of annotated tweets from three middle-income +countries: Brazil, Indonesia, and Nigeria. The expertly curated dataset +includes annotations for 5,952 tweets, assessing their relevance to COVID-19 +vaccines, presence of misinformation, and the themes of the misinformation. To +address challenges posed by domain specificity, the low-resource setting, and +data imbalance, we adopt two approaches for developing COVID-19 vaccine +misinformation detection models: domain-specific pre-training and text +augmentation using a large language model. Our best misinformation detection +models demonstrate improvements ranging from 2.7 to 15.9 percentage points in +macro F1-score compared to the baseline models. Additionally, we apply our +misinformation detection models in a large-scale study of 19 million unlabeled +tweets from the three countries between 2020 and 2022, showcasing the practical +application of our dataset and models for detecting and analyzing vaccine +misinformation in multiple countries and languages. Our analysis indicates that +percentage changes in the number of new COVID-19 cases are positively +associated with COVID-19 vaccine misinformation rates in a staggered manner for +Brazil and Indonesia, and there are significant positive associations between +the misinformation rates across the three countries. + +
+
+ comment: Accepted to EMNLP 2023 (Main conference), 9 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Benchmarking Robustness of Text-Image Composed Retrieval NeurIPS 2023 + + +
+ Text-image composed retrieval aims to retrieve the target image through a composed
+ query, which is specified in the form of an image plus some text that describes
+ desired modifications to the input image. It has recently attracted attention due
+ to its ability to leverage both information-rich images and concise language to
+ precisely express the requirements for target images. However, the robustness of
+ these approaches against real-world corruptions or further textual understanding
+ has never been studied. In this paper, we perform the first robustness study and
+ establish three new diversified benchmarks for systematic analysis of text-image
+ composed retrieval against natural corruptions in both vision and text, and further
+ probe textual understanding. For natural corruption analysis, we introduce two new
+ large-scale benchmark datasets, CIRR-C and FashionIQ-C, for testing in the open
+ domain and the fashion domain respectively, both of which apply 15 visual
+ corruptions and 7 textual corruptions. For textual understanding analysis, we
+ introduce a new diagnostic dataset, CIRR-D, by expanding the original raw data with
+ synthetic data, which contains modified text to better probe textual understanding
+ ability, including numerical variation, attribute variation, object removal,
+ background variation, and fine-grained evaluation. The code and benchmark datasets
+ are available at
+ https://github.com/SunTongtongtong/Benchmark-Robustness-Text-Image-Compose-Retrieval.
+
+
+ comment: Accepted by R0-FoMo: Workshop on Robustness of Few-shot and Zero-shot + Learning in Foundation Models at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for + Large Language Models + + +
+ Large Language Models (LLMs), with their remarkable task-handling +capabilities and innovative outputs, have catalyzed significant advancements +across a spectrum of fields. However, their proficiency within specialized +domains such as biomolecular studies remains limited. To address this +challenge, we introduce Mol-Instructions, a comprehensive instruction dataset +designed for the biomolecular domain. Mol-Instructions encompasses three key +components: molecule-oriented instructions, protein-oriented instructions, and +biomolecular text instructions. Each component aims to improve the +understanding and prediction capabilities of LLMs concerning biomolecular +features and behaviors. Through extensive instruction tuning experiments on +LLMs, we demonstrate the effectiveness of Mol-Instructions in enhancing large +models' performance in the intricate realm of biomolecular studies, thus +fostering progress in the biomolecular research community. Mol-Instructions is +publicly available for ongoing research and will undergo regular updates to +enhance its applicability. + +
+
+ comment: Project homepage: https://github.com/zjunlp/Mol-Instructions, add + more experiments +
+
+
+
+
+ + ♻ ☆ DiskANN++: Efficient Page-based Search over Isomorphic Mapped Graph + Index using Query-sensitivity Entry Vertex + + +
+ Given a vector dataset $\mathcal{X}$ and a query vector $\vec{x}_q$, graph-based
+ Approximate Nearest Neighbor Search (ANNS) aims to build a graph index $G$ and
+ approximately return vectors with minimum distances to $\vec{x}_q$ by searching
+ over $G$. The main drawback of graph-based ANNS is that a graph index would be too
+ large to fit into memory, especially for a large-scale $\mathcal{X}$. To solve
+ this, a Product Quantization (PQ)-based hybrid method called DiskANN is proposed to
+ store a low-dimensional PQ index in memory and retain a graph index on SSD, thus
+ reducing memory overhead while ensuring high search accuracy. However, it suffers
+ from two I/O issues that significantly affect the overall efficiency: (1) a long
+ routing path from an entry vertex to the query's neighborhood that results in a
+ large number of I/O requests, and (2) redundant I/O requests during the routing
+ process. We propose an optimized DiskANN++ to overcome the above issues.
+ Specifically, for the first issue, we present a query-sensitive entry vertex
+ selection strategy to replace DiskANN's static graph-central entry vertex with a
+ dynamically determined entry vertex that is close to the query. For the second I/O
+ issue, we present an isomorphic mapping on DiskANN's graph index to optimize the
+ SSD layout and propose an asynchronously optimized Pagesearch based on the
+ optimized SSD layout as an alternative to DiskANN's beamsearch. Comprehensive
+ experimental studies on eight real-world datasets demonstrate DiskANN++'s
+ superiority in efficiency: we achieve a notable 1.5x to 2.2x improvement in QPS
+ compared to DiskANN, given the same accuracy constraint.
+
+
+ comment: 15 pages including references +
+
+
+
+
+ + ♻ ☆ Editing Large Language Models: Problems, Methods, and Opportunities EMNLP 2023 + + +
+ Despite the ability to train capable LLMs, the methodology for maintaining +their relevancy and rectifying errors remains elusive. To this end, the past +few years have witnessed a surge in techniques for editing LLMs, the objective +of which is to efficiently alter the behavior of LLMs within a specific domain +without negatively impacting performance across other inputs. This paper +embarks on a deep exploration of the problems, methods, and opportunities +related to model editing for LLMs. In particular, we provide an exhaustive +overview of the task definition and challenges associated with model editing, +along with an in-depth empirical analysis of the most progressive methods +currently at our disposal. We also build a new benchmark dataset to facilitate +a more robust evaluation and pinpoint enduring issues intrinsic to existing +techniques. Our objective is to provide valuable insights into the +effectiveness and feasibility of each editing technique, thereby assisting the +community in making informed decisions on the selection of the most appropriate +method for a specific task or context. Code and datasets are available at +https://github.com/zjunlp/EasyEdit. + +
+
+ comment: EMNLP 2023. Updated with new experiments +
+
+
+
+
+ + ♻ ☆ Image to Multi-Modal Retrieval for Industrial Scenarios + + +
+ We formally define a novel and valuable information retrieval task:
+ image-to-multi-modal retrieval (IMMR), where the query is an image and the document
+ is an entity with both an image and a textual description. The IMMR task is
+ valuable in various industrial applications. We analyze three key challenges for
+ IMMR: 1) skewed data and noisy labels in metric learning, 2) multi-modality fusion,
+ and 3) effective and efficient training in large-scale industrial scenarios. To
+ tackle the above challenges, we propose a novel framework for the IMMR task. Our
+ framework consists of three components: 1) a novel data governance scheme coupled
+ with a large-scale classification-based learning paradigm; 2) a model architecture
+ specially designed for multimodal learning, where the proposed concept-aware
+ modality fusion module adaptively fuses the image and text modalities; and 3) a
+ hybrid parallel training approach for tackling large-scale training in industrial
+ scenarios. The proposed framework achieves SOTA performance on public datasets and
+ has been deployed in a real-world industrial search system, leading to significant
+ improvements in click-through rate and the number of deals. Code and data will be
+ made publicly available.
+
+
+
+
+
+ + ♻ ☆ UltraGCN: Ultra Simplification of Graph Convolutional Networks for + Recommendation CIKM 2021 + + +
+ With the recent success of graph convolutional networks (GCNs), they have been
+ widely applied for recommendation and have achieved impressive performance gains.
+ The core of GCNs lies in their message passing mechanism for aggregating
+ neighborhood information. However, we observed that message passing largely slows
+ down the convergence of GCNs during training, especially for large-scale
+ recommender systems, which hinders their wide adoption. LightGCN makes an early
+ attempt to simplify GCNs for collaborative filtering by omitting feature
+ transformations and nonlinear activations. In this paper, we take one step further
+ and propose an ultra-simplified formulation of GCNs (dubbed UltraGCN), which skips
+ infinite layers of message passing for efficient recommendation. Instead of
+ explicit message passing, UltraGCN resorts to directly approximating the limit of
+ infinite-layer graph convolutions via a constraint loss. Meanwhile, UltraGCN allows
+ for more appropriate edge weight assignments and flexible adjustment of the
+ relative importance of different types of relationships. This finally yields a
+ simple yet effective UltraGCN model, which is easy to implement and efficient to
+ train. Experimental results on four benchmark datasets show that UltraGCN not only
+ outperforms the state-of-the-art GCN models but also achieves more than 10x speedup
+ over LightGCN. Our source code will be available at
+ https://reczoo.github.io/UltraGCN.
+
+
+ comment: Accepted by CIKM 2021. Code available at: + https://reczoo.github.io/UltraGCN +
+
+
+
+
+ + ♻ ☆ SimpleX: A Simple and Strong Baseline for Collaborative Filtering CIKM 2021 + + +
+ Collaborative filtering (CF) is a widely studied research topic in recommender
+ systems. The learning of a CF model generally depends on three major components,
+ namely the interaction encoder, the loss function, and negative sampling. While
+ many existing studies focus on the design of more powerful interaction encoders,
+ the impacts of loss functions and negative sampling ratios have not yet been well
+ explored. In this work, we show that the choice of loss function as well as the
+ negative sampling ratio is equally important. More specifically, we propose the
+ cosine contrastive loss (CCL) and further incorporate it into a simple unified CF
+ model, dubbed SimpleX. Extensive experiments have been conducted on 11 benchmark
+ datasets and compared with 29 existing CF models in total. Surprisingly, the
+ results show that, under our CCL loss and a large negative sampling ratio, SimpleX
+ can surpass most sophisticated state-of-the-art models by a large margin (e.g., a
+ maximum 48.5% improvement in NDCG@20 over LightGCN). We believe that SimpleX can
+ not only serve as a simple, strong baseline to foster future research on CF, but
+ also shed light on a potential research direction towards improving loss functions
+ and negative sampling. Our source code will be available at
+ https://reczoo.github.io/SimpleX.
+
+
+ comment: Accepted by CIKM 2021. Code available at + https://reczoo.github.io/SimpleX +
+
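+ A rough PyTorch sketch of a cosine contrastive loss in the spirit described here:
+ positive pairs are pulled toward cosine similarity 1 while negatives are penalized
+ only above a margin. The margin value, negative weighting, and reduction are
+ assumptions and may differ from the paper's exact formulation.
+
+ # Hedged sketch of a cosine-contrastive-style loss; hyperparameters are assumptions.
+ import torch
+ import torch.nn.functional as F
+
+ def cosine_contrastive_loss(user, pos_item, neg_items, margin=0.8, neg_weight=1.0):
+     """user: (B, d); pos_item: (B, d); neg_items: (B, n, d)."""
+     u = F.normalize(user, dim=-1)
+     p = F.normalize(pos_item, dim=-1)
+     n = F.normalize(neg_items, dim=-1)
+     pos_cos = (u * p).sum(-1)                        # (B,)
+     neg_cos = torch.einsum("bd,bnd->bn", u, n)       # (B, n)
+     pos_loss = 1.0 - pos_cos                         # pull positives toward cos = 1
+     neg_loss = torch.relu(neg_cos - margin).mean(-1) # penalize only "hard" negatives
+     return (pos_loss + neg_weight * neg_loss).mean()
+
+ loss = cosine_contrastive_loss(torch.randn(4, 64), torch.randn(4, 64), torch.randn(4, 16, 64))
+ print(loss.item())
+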
+
+
+
+ + ♻ ☆ FinalMLP: An Enhanced Two-Stream MLP Model for CTR Prediction AAAI 2023 + + +
+ Click-through rate (CTR) prediction is one of the fundamental tasks for online
+ advertising and recommendation. While the multi-layer perceptron (MLP) serves as a
+ core component in many deep CTR prediction models, it has been widely recognized
+ that applying a vanilla MLP network alone is inefficient in learning multiplicative
+ feature interactions. As such, many two-stream interaction models (e.g., DeepFM and
+ DCN) have been proposed by integrating an MLP network with another dedicated
+ network for enhanced CTR prediction. As the MLP stream learns feature interactions
+ implicitly, existing research focuses mainly on enhancing explicit feature
+ interactions in the complementary stream. In contrast, our empirical study shows
+ that a well-tuned two-stream MLP model that simply combines two MLPs can achieve
+ surprisingly good performance, which has not been reported by existing work. Based
+ on this observation, we further propose feature gating and interaction aggregation
+ layers that can be easily plugged in to build an enhanced two-stream MLP model,
+ FinalMLP. In this way, it not only enables differentiated feature inputs but also
+ effectively fuses stream-level interactions across the two streams. Our evaluation
+ results on four open benchmark datasets, as well as an online A/B test in our
+ industrial system, show that FinalMLP achieves better performance than many
+ sophisticated two-stream CTR models. Our source code will be available at
+ MindSpore/models.
+
+
+ comment: Accepted by AAAI 2023. Code available at + https://reczoo.github.io/FinalMLP +
+
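+ To make the two-stream MLP idea concrete, here is a minimal PyTorch sketch of two
+ parallel MLPs whose outputs are concatenated and linearly fused into a CTR logit.
+ The layer sizes and the simple concatenation fusion are assumptions; the paper's
+ feature gating and interaction aggregation layers are not reproduced here.
+
+ # Minimal two-stream MLP sketch for CTR prediction (sizes and fusion are assumed).
+ import torch
+ import torch.nn as nn
+
+ class TwoStreamMLP(nn.Module):
+     def __init__(self, in_dim, hidden=(256, 128)):
+         super().__init__()
+         def mlp():
+             layers, d = [], in_dim
+             for h in hidden:
+                 layers += [nn.Linear(d, h), nn.ReLU()]
+                 d = h
+             return nn.Sequential(*layers)
+         self.stream_a = mlp()
+         self.stream_b = mlp()
+         self.fuse = nn.Linear(2 * hidden[-1], 1)   # simple concat fusion (assumed)
+
+     def forward(self, x):
+         return self.fuse(torch.cat([self.stream_a(x), self.stream_b(x)], dim=-1))
+
+ model = TwoStreamMLP(in_dim=64)
+ logits = model(torch.randn(8, 64))
+ print(logits.shape)   # (8, 1)
+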
+
+
+
+ + ♻ ☆ ReLoop2: Building Self-Adaptive Recommendation Models via Responsive + Error Compensation Loop KDD 2023 + + +
+ Industrial recommender systems face the challenge of operating in +non-stationary environments, where data distribution shifts arise from evolving +user behaviors over time. To tackle this challenge, a common approach is to +periodically re-train or incrementally update deployed deep models with newly +observed data, resulting in a continual training process. However, the +conventional learning paradigm of neural networks relies on iterative +gradient-based updates with a small learning rate, making it slow for large +recommendation models to adapt. In this paper, we introduce ReLoop2, a +self-correcting learning loop that facilitates fast model adaptation in online +recommender systems through responsive error compensation. Inspired by the +slow-fast complementary learning system observed in human brains, we propose an +error memory module that directly stores error samples from incoming data +streams. These stored samples are subsequently leveraged to compensate for +model prediction errors during testing, particularly under distribution shifts. +The error memory module is designed with fast access capabilities and undergoes +continual refreshing with newly observed data samples during the model serving +phase to support fast model adaptation. We evaluate the effectiveness of +ReLoop2 on three open benchmark datasets as well as a real-world production +dataset. The results demonstrate the potential of ReLoop2 in enhancing the +responsiveness and adaptiveness of recommender systems operating in +non-stationary environments. + +
+
+ comment: Accepted by KDD 2023 +
+
+
+
+
+ + ♻ ☆ BARS-CTR: Open Benchmarking for Click-Through Rate Prediction CIKM 2021 + + +
+ Click-through rate (CTR) prediction is a critical task for many applications, +as its accuracy has a direct impact on user experience and platform revenue. In +recent years, CTR prediction has been widely studied in both academia and +industry, resulting in a wide variety of CTR prediction models. Unfortunately, +there is still a lack of standardized benchmarks and uniform evaluation +protocols for CTR prediction research. This leads to non-reproducible or even +inconsistent experimental results among existing studies, which largely limits +the practical value and potential impact of their research. In this work, we +aim to perform open benchmarking for CTR prediction and present a rigorous +comparison of different models in a reproducible manner. To this end, we ran +over 7,000 experiments for more than 12,000 GPU hours in total to re-evaluate +24 existing models on multiple datasets and settings. Surprisingly, our +experiments show that with sufficient hyper-parameter search and model tuning, +many deep models have smaller differences than expected. The results also +reveal that making real progress on the modeling of CTR prediction is indeed a +very challenging research task. We believe that our benchmarking work could not +only allow researchers to gauge the effectiveness of new models conveniently +but also make them fairly compare with the state of the arts. We have publicly +released the benchmarking code, evaluation protocols, and hyper-parameter +settings of our work to promote reproducible research in this field. + +
+
+ comment: Accepted by CIKM 2021. See the benchmark at + https://openbenchmark.github.io/BARS/CTR +
+
+
+
+
+
+
+
+ + Machine Learning 150 + +
+
+
+ + ☆ Dataset Distillation in Large Data Era + + +
+ Dataset distillation aims to generate a smaller but representative subset from a
+ large dataset, which allows a model to be trained efficiently while still achieving
+ decent performance when evaluated on the original test data distribution. Many
+ prior works have aimed to align with diverse aspects of the original datasets, such
+ as matching the training weight trajectories, gradients, feature/BatchNorm
+ distributions, etc. In this work, we show how to distill various large-scale
+ datasets such as full ImageNet-1K/21K under a conventional input resolution of
+ 224$\times$224 to achieve the best accuracy over all previous approaches, including
+ SRe$^2$L, TESLA and MTT. To achieve this, we introduce a simple yet effective ${\bf
+ C}$urriculum ${\bf D}$ata ${\bf A}$ugmentation ($\texttt{CDA}$) during data
+ synthesis that achieves 63.2% accuracy on large-scale ImageNet-1K under IPC (Images
+ Per Class) 50 and 36.1% on ImageNet-21K under IPC 20. Finally, we show that, by
+ integrating all our enhancements together, the proposed model beats the current
+ state-of-the-art by more than 4% Top-1 accuracy on ImageNet-1K/21K and, for the
+ first time, reduces the gap to its full-data training counterpart to less than an
+ absolute 15%. Moreover, this work represents the inaugural success in dataset
+ distillation on larger-scale ImageNet-21K under the standard 224$\times$224
+ resolution. Our code and distilled ImageNet-21K dataset with 20 IPC and a 2K
+ recovery budget are available at https://github.com/VILA-Lab/SRe2L/tree/main/CDA.
+
+
+ comment: Code and distilled ImageNet-21K dataset are available at + https://github.com/VILA-Lab/SRe2L/tree/main/CDA +
+
+
+
+
+ + ☆ VIDiff: Translating Videos via Multi-Modal Instructions with Diffusion + Models + + +
+ Diffusion models have achieved significant success in image and video +generation. This motivates a growing interest in video editing tasks, where +videos are edited according to provided text descriptions. However, most +existing approaches only focus on video editing for short clips and rely on +time-consuming tuning or inference. We are the first to propose Video +Instruction Diffusion (VIDiff), a unified foundation model designed for a wide +range of video tasks. These tasks encompass both understanding tasks (such as +language-guided video object segmentation) and generative tasks (video editing +and enhancement). Our model can edit and translate the desired results within +seconds based on user instructions. Moreover, we design an iterative +auto-regressive method to ensure consistency in editing and enhancing long +videos. We provide convincing generative results for diverse input videos and +written instructions, both qualitatively and quantitatively. More examples can +be found at our website https://ChenHsing.github.io/VIDiff. + +
+
+
+
+
+ + ☆ Motion-Conditioned Image Animation for Video Editing + + +
+ We introduce MoCA, a Motion-Conditioned Image Animation approach for video +editing. It leverages a simple decomposition of the video editing problem into +image editing followed by motion-conditioned image animation. Furthermore, +given the lack of robust evaluation datasets for video editing, we introduce a +new benchmark that measures edit capability across a wide variety of tasks, +such as object replacement, background changes, style changes, and motion +edits. We present a comprehensive human evaluation of the latest video editing +methods along with MoCA, on our proposed benchmark. MoCA establishes a new +state-of-the-art, demonstrating greater human preference win-rate, and +outperforming notable recent approaches including Dreamix (63%), MasaCtrl +(75%), and Tune-A-Video (72%), with especially significant improvements for +motion edits. + +
+
+ comment: Project page: https://facebookresearch.github.io/MoCA +
+
+
+
+
+ + ☆ Geometry-Aware Normalizing Wasserstein Flows for Optimal Causal + Inference + + +
+ This manuscript enriches the framework of continuous normalizing flows (CNFs) +within causal inference, primarily to augment the geometric properties of +parametric submodels used in targeted maximum likelihood estimation (TMLE). By +introducing an innovative application of CNFs, we construct a refined series of +parametric submodels that enable a directed interpolation between the prior +distribution $p_0$ and the empirical distribution $p_1$. This proposed +methodology serves to optimize the semiparametric efficiency bound in causal +inference by orchestrating CNFs to align with Wasserstein gradient flows. Our +approach not only endeavors to minimize the mean squared error in the +estimation but also imbues the estimators with geometric sophistication, +thereby enhancing robustness against misspecification. This robustness is +crucial, as it alleviates the dependence on the standard $n^{\frac{1}{4}}$ rate +for a doubly-robust perturbation direction in TMLE. By incorporating robust +optimization principles and differential geometry into the estimators, the +developed geometry-aware CNFs represent a significant advancement in the +pursuit of doubly robust causal inference. + +
+
+
+
+
+ + ☆ An Adaptive Framework for Generalizing Network Traffic Prediction + towards Uncertain Environments + + +
+ We have developed a new framework using time-series analysis for dynamically +assigning mobile network traffic prediction models in previously unseen +wireless environments. Our framework selectively employs learned behaviors, +outperforming any single model with over a 50% improvement relative to current +studies. More importantly, it surpasses traditional approaches without needing +prior knowledge of a cell. While this paper focuses on network traffic +prediction using our adaptive forecasting framework, this framework can also be +applied to other machine learning applications in uncertain environments. + The framework begins with unsupervised clustering of time-series data to +identify unique trends and seasonal patterns. Subsequently, we apply supervised +learning for traffic volume prediction within each cluster. This specialization +towards specific traffic behaviors occurs without penalties from spatial and +temporal variations. Finally, the framework adaptively assigns trained models +to new, previously unseen cells. By analyzing real-time measurements of a cell, +our framework intelligently selects the most suitable cluster for that cell at +any given time, with cluster assignment dynamically adjusting to +spatio-temporal fluctuations. + +
+
+
+
+
+ + ☆ Initializing Models with Larger Ones + + +
+ Weight initialization plays an important role in neural network training. +Widely used initialization methods are proposed and evaluated for networks that +are trained from scratch. However, the growing number of pretrained models now +offers new opportunities for tackling this classical problem of weight +initialization. In this work, we introduce weight selection, a method for +initializing smaller models by selecting a subset of weights from a pretrained +larger model. This enables the transfer of knowledge from pretrained weights to +smaller models. Our experiments demonstrate that weight selection can +significantly enhance the performance of small models and reduce their training +time. Notably, it can also be used together with knowledge distillation. Weight +selection offers a new approach to leverage the power of pretrained models in +resource-constrained settings, and we hope it can be a useful tool for training +small models in the large-model era. Code is available at +https://github.com/OscarXZQ/weight-selection. + +
+
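+ A hedged sketch of the general idea: initialize each tensor of a smaller model from
+ a slice of the corresponding tensor of a larger pretrained model. The leading-slice
+ selection rule used here is an illustrative assumption; the authors' released code
+ defines the actual selection scheme.
+
+ # Illustrative "weight selection": copy a leading slice of each matching tensor
+ # from a larger model into a smaller one. The slicing rule is an assumption.
+ import torch.nn as nn
+
+ def select_weights(small: nn.Module, large: nn.Module):
+     small_sd, large_sd = small.state_dict(), large.state_dict()
+     for name, w_small in small_sd.items():
+         if name not in large_sd:
+             continue
+         w_large = large_sd[name]
+         if w_large.dim() != w_small.dim():
+             continue
+         if not all(l >= s for l, s in zip(w_large.shape, w_small.shape)):
+             continue
+         slices = tuple(slice(0, s) for s in w_small.shape)
+         small_sd[name] = w_large[slices].clone()
+     small.load_state_dict(small_sd)
+
+ large = nn.Sequential(nn.Linear(128, 256), nn.ReLU(), nn.Linear(256, 10))
+ small = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 10))
+ select_weights(small, large)
+ print(small[0].weight.shape)  # (64, 128), now holding a slice of the larger layer
+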
+
+
+
+ + ☆ Dichotomy of Early and Late Phase Implicit Biases Can Provably Induce + Grokking + + +
+ Recent work by Power et al. (2022) highlighted a surprising "grokking" +phenomenon in learning arithmetic tasks: a neural net first "memorizes" the +training set, resulting in perfect training accuracy but near-random test +accuracy, and after training for sufficiently longer, it suddenly transitions +to perfect test accuracy. This paper studies the grokking phenomenon in +theoretical setups and shows that it can be induced by a dichotomy of early and +late phase implicit biases. Specifically, when training homogeneous neural nets +with large initialization and small weight decay on both classification and +regression tasks, we prove that the training process gets trapped at a solution +corresponding to a kernel predictor for a long time, and then a very sharp +transition to min-norm/max-margin predictors occurs, leading to a dramatic +change in test accuracy. + +
+
+ comment: 39 pages, 4 figures +
+
+
+
+
+ + ☆ Pre-registration for Predictive Modeling + + +
+ Amid rising concerns of reproducibility and generalizability in predictive +modeling, we explore the possibility and potential benefits of introducing +pre-registration to the field. Despite notable advancements in predictive +modeling, spanning core machine learning tasks to various scientific +applications, challenges such as overlooked contextual factors, data-dependent +decision-making, and unintentional re-use of test data have raised questions +about the integrity of results. To address these issues, we propose adapting +pre-registration practices from explanatory modeling to predictive modeling. We +discuss current best practices in predictive modeling and their limitations, +introduce a lightweight pre-registration template, and present a qualitative +study with machine learning researchers to gain insight into the effectiveness +of pre-registration in preventing biased estimates and promoting more reliable +research outcomes. We conclude by exploring the scope of problems that +pre-registration can address in predictive modeling and acknowledging its +limitations within this context. + +
+
+
+
+
+ + ☆ Efficient Baseline for Quantitative Precipitation Forecasting in + Weather4cast 2023 + + +
+ Accurate precipitation forecasting is indispensable for informed +decision-making across various industries. However, the computational demands +of current models raise environmental concerns. We address the critical need +for accurate precipitation forecasting while considering the environmental +impact of computational resources and propose a minimalist U-Net architecture +to be used as a baseline for future weather forecasting initiatives. + +
+
+ comment: 5 pages, 1 figure, Weather4Cast 2023 challenge +
+
+
+
+
+ + ☆ BIOCLIP: A Vision Foundation Model for the Tree of Life + + +
+ Images of the natural world, collected by a variety of cameras, from drones +to individual phones, are increasingly abundant sources of biological +information. There is an explosion of computational methods and tools, +particularly computer vision, for extracting biologically relevant information +from images for science and conservation. Yet most of these are bespoke +approaches designed for a specific task and are not easily adaptable or +extendable to new questions, contexts, and datasets. A vision model for general +organismal biology questions on images is of timely need. To approach this, we +curate and release TreeOfLife-10M, the largest and most diverse ML-ready +dataset of biology images. We then develop BioCLIP, a foundation model for the +tree of life, leveraging the unique properties of biology captured by +TreeOfLife-10M, namely the abundance and variety of images of plants, animals, +and fungi, together with the availability of rich structured biological +knowledge. We rigorously benchmark our approach on diverse fine-grained biology +classification tasks, and find that BioCLIP consistently and substantially +outperforms existing baselines (by 17% to 20% absolute). Intrinsic evaluation +reveals that BioCLIP has learned a hierarchical representation conforming to +the tree of life, shedding light on its strong generalizability. Our code, +models and data will be made available at +https://github.com/Imageomics/bioclip. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ Communication-Efficient Federated Optimization over Semi-Decentralized + Networks + + +
+ In large-scale federated and decentralized learning, communication efficiency +is one of the most challenging bottlenecks. While gossip communication -- where +agents can exchange information with their connected neighbors -- is more +cost-effective than communicating with the remote server, it often requires a +greater number of communication rounds, especially for large and sparse +networks. To tackle the trade-off, we examine the communication efficiency +under a semi-decentralized communication protocol, in which agents can perform +both agent-to-agent and agent-to-server communication in a probabilistic +manner. We design a tailored communication-efficient algorithm over +semi-decentralized networks, referred to as PISCO, which inherits the +robustness to data heterogeneity thanks to gradient tracking and allows +multiple local updates for saving communication. We establish the convergence +rate of PISCO for nonconvex problems and show that PISCO enjoys a linear +speedup in terms of the number of agents and local updates. Our numerical +results highlight the superior communication efficiency of PISCO and its +resilience to data heterogeneity and various network topologies. + +
+
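+ To illustrate only the semi-decentralized communication pattern (gradient tracking
+ and local model updates are omitted), here is a toy mixing round in which each
+ agent either syncs with the server average with some probability or gossips with
+ its ring neighbors; the topology, probability, and averaging weights are arbitrary
+ assumptions, not PISCO itself.
+
+ # Toy semi-decentralized mixing round: server sync with probability p, else gossip.
+ import numpy as np
+
+ rng = np.random.default_rng(1)
+ n_agents, dim, p_server = 6, 3, 0.3
+ x = rng.normal(size=(n_agents, dim))                  # local models
+ ring = {i: [(i - 1) % n_agents, (i + 1) % n_agents] for i in range(n_agents)}
+
+ def mixing_round(x):
+     server_avg = x.mean(axis=0)
+     new_x = x.copy()
+     for i in range(n_agents):
+         if rng.random() < p_server:                   # agent-to-server communication
+             new_x[i] = server_avg
+         else:                                         # agent-to-agent gossip on the ring
+             new_x[i] = x[ring[i] + [i]].mean(axis=0)
+     return new_x
+
+ for _ in range(20):
+     x = mixing_round(x)
+ print(np.std(x, axis=0))   # disagreement shrinks toward consensus
+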
+
+
+
+ + ☆ MultiResFormer: Transformer with Adaptive Multi-Resolution Modeling for + General Time Series Forecasting + + +
+ Transformer-based models have greatly pushed the boundaries of time series +forecasting recently. Existing methods typically encode time series data into +$\textit{patches}$ using one or a fixed set of patch lengths. This, however, +could result in a lack of ability to capture the variety of intricate temporal +dependencies present in real-world multi-periodic time series. In this paper, +we propose MultiResFormer, which dynamically models temporal variations by +adaptively choosing optimal patch lengths. Concretely, at the beginning of each +layer, time series data is encoded into several parallel branches, each using a +detected periodicity, before going through the transformer encoder block. We +conduct extensive evaluations on long- and short-term forecasting datasets +comparing MultiResFormer with state-of-the-art baselines. MultiResFormer +outperforms patch-based Transformer baselines on long-term forecasting tasks +and also consistently outperforms CNN baselines by a large margin, while using +much fewer parameters than these baselines. + +
+
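+ The "detected periodicity" step can be pictured with a simple FFT-based period
+ detector that returns dominant periods usable as candidate patch lengths; this
+ generic sketch is an assumption about the mechanism, not the paper's exact
+ procedure.
+
+ # Generic FFT-based period detection sketch (an illustrative assumption).
+ import numpy as np
+
+ def detect_periods(x, k=2):
+     """x: 1-D series; returns up to k candidate periods (in time steps)."""
+     amps = np.abs(np.fft.rfft(x - x.mean()))
+     amps[0] = 0.0                              # drop the DC component
+     top = np.argsort(amps)[::-1][:k]           # dominant frequency bins
+     periods = [len(x) // f for f in top if f > 0]
+     return sorted(set(periods), reverse=True)
+
+ t = np.arange(512)
+ series = np.sin(2 * np.pi * t / 32) + 0.5 * np.sin(2 * np.pi * t / 128)
+ print(detect_periods(series, k=2))   # expected to recover periods 128 and 32
+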
+
+
+
+ + ☆ CoDi-2: In-Context, Interleaved, and Interactive Any-to-Any Generation + + +
+ We present CoDi-2, a versatile and interactive Multimodal Large Language +Model (MLLM) that can follow complex multimodal interleaved instructions, +conduct in-context learning (ICL), reason, chat, edit, etc., in an any-to-any +input-output modality paradigm. By aligning modalities with language for both +encoding and generation, CoDi-2 empowers Large Language Models (LLMs) to not +only understand complex modality-interleaved instructions and in-context +examples, but also autoregressively generate grounded and coherent multimodal +outputs in the continuous feature space. To train CoDi-2, we build a +large-scale generation dataset encompassing in-context multimodal instructions +across text, vision, and audio. CoDi-2 demonstrates a wide range of zero-shot +capabilities for multimodal generation, such as in-context learning, reasoning, +and compositionality of any-to-any modality generation through multi-round +interactive conversation. CoDi-2 surpasses previous domain-specific models on +tasks such as subject-driven image generation, vision transformation, and audio +editing. CoDi-2 signifies a substantial breakthrough in developing a +comprehensive multimodal foundation model adept at interpreting in-context +language-vision-audio interleaved instructions and producing multimodal +outputs. + +
+
+ comment: Project Page: https://codi-2.github.io/ +
+
+
+
+
+ + ☆ Online Change Points Detection for Linear Dynamical Systems with Finite + Sample Guarantees + + +
+ The problem of online change point detection is to detect abrupt changes in +properties of time series, ideally as soon as possible after those changes +occur. Existing work on online change point detection either assumes i.i.d +data, focuses on asymptotic analysis, does not present theoretical guarantees +on the trade-off between detection accuracy and detection delay, or is only +suitable for detecting single change points. In this work, we study the online +change point detection problem for linear dynamical systems with unknown +dynamics, where the data exhibits temporal correlations and the system could +have multiple change points. We develop a data-dependent threshold that can be +used in our test that allows one to achieve a pre-specified upper bound on the +probability of making a false alarm. We further provide a finite-sample-based +bound for the probability of detecting a change point. Our bound demonstrates +how parameters used in our algorithm affect the detection probability and +delay, and provides guidance on the minimum required time between changes to +guarantee detection. + +
+
+ comment: 11 pages, 3 figures +
+
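+ As a generic illustration of residual-based online change detection for a linear
+ system (not the paper's test statistic, which uses a data-dependent threshold with
+ finite-sample guarantees), one can identify the dynamics on a reference window by
+ least squares and raise an alarm once the one-step prediction residual exceeds a
+ threshold. The system matrices, noise level, and 5x-median threshold below are
+ arbitrary assumptions.
+
+ # Generic residual-based change detector for x_{t+1} = A x_t + noise (illustrative).
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ n, T, switch = 2, 400, 250
+ A1 = np.array([[0.9, 0.1], [0.0, 0.8]])           # dynamics before the change
+ A2 = np.array([[1.1, 0.0], [0.2, 1.05]])          # dynamics after the change
+ x = np.zeros((T, n))
+ x[0] = rng.normal(size=n)
+ for t in range(T - 1):
+     A = A1 if t < switch else A2
+     x[t + 1] = A @ x[t] + 0.05 * rng.normal(size=n)
+
+ warmup = 100                                      # identify the system on this window
+ B = np.linalg.lstsq(x[:warmup - 1], x[1:warmup], rcond=None)[0]   # x_{t+1} ~ x_t @ B
+ res = np.linalg.norm(x[warmup:] - x[warmup - 1:-1] @ B, axis=1)   # one-step residuals
+ threshold = 5 * np.median(res[:100])              # crude threshold (assumed, not the paper's)
+ alarm = warmup + int(np.argmax(res > threshold))  # first index above the threshold
+ print("first alarm at t =", alarm, "| true change at t =", switch)
+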
+
+
+
+ + ☆ MLLMs-Augmented Visual-Language Representation Learning + + +
+ Visual-language pre-training (VLP) has achieved remarkable success in multi-modal
+ tasks, largely attributed to the availability of large-scale image-text datasets.
+ In this work, we demonstrate that multi-modal large language models (MLLMs) can
+ enhance visual-language representation learning by improving data quality. Our
+ approach is simple, utilizing MLLMs to extend multiple captions for each image. To
+ prevent the bias introduced by MLLMs' hallucinations and intrinsic caption styles,
+ we propose "text shearing" to keep the lengths of the extended captions identical
+ to the originals. In image-text retrieval, our method consistently obtains 5.6 ~
+ 35.0% and 16.8 ~ 46.1% improvements in R@1 under the fine-tuning and zero-shot
+ settings, respectively. Notably, our zero-shot results are comparable to
+ fine-tuning on target datasets, which encourages more exploration of the versatile
+ use of MLLMs.
+
+
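+ One plausible reading of "text shearing" is clipping each MLLM-extended caption to
+ the token length of its original caption; the whitespace tokenization below is an
+ assumption made purely for illustration, not the paper's exact procedure.
+
+ # Illustrative "text shearing": clip an extended caption to the original's length.
+ # Whitespace tokenization is an assumption; the paper may shear at another granularity.
+ def shear(original: str, extended: str) -> str:
+     budget = len(original.split())
+     return " ".join(extended.split()[:budget])
+
+ orig = "a dog runs on the beach"
+ ext = "a small brown dog runs happily on the sunny beach near the waves"
+ print(shear(orig, ext))   # keeps only as many tokens as the original caption
+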
+
+
+
+ + ☆ Continual Diffusion with STAMINA: STack-And-Mask INcremental Adapters + + +
+ Recent work has demonstrated a remarkable ability to customize text-to-image +diffusion models to multiple, fine-grained concepts in a sequential (i.e., +continual) manner while only providing a few example images for each concept. +This setting is known as continual diffusion. Here, we ask the question: Can we +scale these methods to longer concept sequences without forgetting? Although +prior work mitigates the forgetting of previously learned concepts, we show +that its capacity to learn new tasks reaches saturation over longer sequences. +We address this challenge by introducing a novel method, STack-And-Mask +INcremental Adapters (STAMINA), which is composed of low-ranked +attention-masked adapters and customized MLP tokens. STAMINA is designed to +enhance the robust fine-tuning properties of LoRA for sequential concept +learning via learnable hard-attention masks parameterized with low rank MLPs, +enabling precise, scalable learning via sparse adaptation. Notably, all +introduced trainable parameters can be folded back into the model after +training, inducing no additional inference parameter costs. We show that +STAMINA outperforms the prior SOTA for the setting of text-to-image continual +customization on a 50-concept benchmark composed of landmarks and human faces, +with no stored replay data. Additionally, we extended our method to the setting +of continual learning for image classification, demonstrating that our gains +also translate to state-of-the-art performance in this standard benchmark. + +
+
+
+
+
+ + ☆ Language Model Agents Suffer from Compositional Generalization in Web + Automation + + +
+ Language model agents (LMA) recently emerged as a promising paradigm for multi-step
+ decision-making tasks, often outperforming humans and other reinforcement learning
+ agents. Despite the promise, their performance on real-world applications that
+ often involve combinations of tasks is still underexplored. In this work, we
+ introduce a new benchmark, called CompWoB -- 50 new compositional web automation
+ tasks reflecting more realistic assumptions. We show that while existing prompted
+ LMAs (gpt-3.5-turbo or gpt-4) achieve a 94.0% average success rate on base tasks,
+ their performance degrades to a 24.9% success rate on compositional tasks. On the
+ other hand, transferred LMAs (finetuned only on base tasks) show a smaller
+ generalization gap, dropping from 85.4% to 54.8%. By balancing the data
+ distribution across tasks, we train a new model, HTML-T5++, that surpasses
+ human-level performance (95.2%) on MiniWoB and achieves the best zero-shot
+ performance on CompWoB (61.5%). While these results highlight the promise of
+ small-scale finetuned and transferred models for compositional generalization,
+ their performance further degrades under different instruction compositions that
+ change the combination order. In contrast to the recent remarkable success of LMAs,
+ our benchmark and detailed analysis emphasize the necessity of building LMAs that
+ are robust and generalizable to task compositionality for real-world deployment.
+
+
+ comment: Code: + https://github.com/google-research/google-research/tree/master/compositional_rl/compwob +
+
+
+
+
+ + ☆ TransCORALNet: A Two-Stream Transformer CORAL Networks for Supply Chain + Credit Assessment Cold Start + + +
+ This paper proposes an interpretable two-stream transformer CORAL network
+ (TransCORALNet) for supply chain credit assessment under the segment industry and
+ cold start problem. The model aims to provide accurate credit assessment
+ predictions for new supply chain borrowers with limited historical data. Here, the
+ two-stream domain adaptation architecture with correlation alignment (CORAL) loss
+ is used as the core model and is equipped with a transformer, which provides
+ insights about the learned features and allows efficient parallelization during
+ training. Thanks to the domain adaptation capability of the proposed model, the
+ domain shift between the source and target domains is minimized. Therefore, the
+ model exhibits good generalization where the source and target do not follow the
+ same distribution and only a limited amount of labeled target instances exist.
+ Furthermore, we employ Local Interpretable Model-agnostic Explanations (LIME) to
+ provide more insight into the model predictions and identify the key features
+ contributing to supply chain credit assessment decisions. The proposed model
+ addresses four significant supply chain credit assessment challenges: domain shift,
+ cold start, class imbalance, and interpretability. Experimental results on a
+ real-world data set demonstrate the superiority of TransCORALNet over a number of
+ state-of-the-art baselines in terms of accuracy. The code is available on GitHub at
+ https://github.com/JieJieNiu/TransCORALN.
+
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ☆ A data-science pipeline to enable the Interpretability of Many-Objective + Feature Selection + + +
+ Many-Objective Feature Selection (MOFS) approaches use four or more objectives to
+ determine the relevance of a subset of features in a supervised learning task. As a
+ consequence, MOFS typically returns a large set of non-dominated solutions, which
+ have to be assessed by the data scientist in order to proceed with the final
+ choice. Given the multi-variate nature of the assessment, which may include
+ criteria (e.g. fairness) not related to predictive accuracy, this step is often not
+ straightforward and suffers from the lack of dedicated tools. For instance, it is
+ common to make use of a tabular presentation of the solutions, which provides
+ little information about the trade-offs and the relations between criteria over the
+ set of solutions.
+  This paper proposes an original methodology to support data scientists in the
+ interpretation and comparison of the MOFS outcome by combining post-processing and
+ visualisation of the set of solutions. The methodology supports the data scientist
+ in the selection of an optimal feature subset by providing her with high-level
+ information at three different levels: objectives, solutions, and individual
+ features.
+  The methodology is experimentally assessed on two feature selection tasks adopting
+ a GA-based MOFS with six objectives (number of selected features, balanced
+ accuracy, F1-Score, variance inflation factor, statistical parity, and equalised
+ odds). The results show the added value of the methodology in the selection of the
+ final subset of features.
+
+
+ comment: 8 pages, 5 figures, 6 tables +
+
+
+
+
+ + ☆ $\mathbb{Z}_2\times \mathbb{Z}_2$ Equivariant Quantum Neural Networks: + Benchmarking against Classical Neural Networks + + +
+ This paper presents a comprehensive comparative analysis of the performance +of Equivariant Quantum Neural Networks (EQNN) and Quantum Neural Networks +(QNN), juxtaposed against their classical counterparts: Equivariant Neural +Networks (ENN) and Deep Neural Networks (DNN). We evaluate the performance of +each network with two toy examples for a binary classification task, focusing +on model complexity (measured by the number of parameters) and the size of the +training data set. Our results show that the $\mathbb{Z}_2\times \mathbb{Z}_2$ +EQNN and the QNN provide superior performance for smaller parameter sets and +modest training data samples. + +
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ☆ AlignBench: Benchmarking Chinese Alignment of Large Language Models + + +
+ Alignment has become a critical step for instruction-tuned Large Language +Models (LLMs) to become helpful assistants. However, effective evaluation of +alignment for emerging Chinese LLMs is still significantly lacking, calling for +real-scenario grounded, open-ended, challenging and automatic evaluations +tailored for alignment. To fill in this gap, we introduce AlignBench, a +comprehensive multi-dimensional benchmark for evaluating LLMs' alignment in +Chinese. Equipped with a human-in-the-loop data curation pipeline, our +benchmark employs a rule-calibrated multi-dimensional LLM-as-Judge with +Chain-of-Thought to generate explanations and final ratings as evaluations, +ensuring high reliability and interpretability. Furthermore, we developed a +dedicated companion evaluator LLM -- CritiqueLLM, which recovers 95\% of +GPT-4's evaluation ability and will be provided via public APIs to researchers +for evaluation of alignment in Chinese LLMs. All evaluation codes, data, and +LLM generations are available at \url{https://github.com/THUDM/AlignBench}. + +
+
+
+
+
+ + ☆ VREM-FL: Mobility-Aware Computation-Scheduling Co-Design for Vehicular + Federated Learning + + +
+ Assisted and autonomous driving are rapidly gaining momentum, and will soon +become a reality. Among their key enablers, artificial intelligence and machine +learning are expected to play a prominent role, also thanks to the massive +amount of data that smart vehicles will collect from their onboard sensors. In +this domain, federated learning is one of the most effective and promising +techniques for training global machine learning models, while preserving data +privacy at the vehicles and optimizing communications resource usage. In this +work, we propose VREM-FL, a computation-scheduling co-design for vehicular +federated learning that leverages mobility of vehicles in conjunction with +estimated 5G radio environment maps. VREM-FL jointly optimizes the global model +learned at the server while wisely allocating communication resources. This is +achieved by orchestrating local computations at the vehicles in conjunction +with the transmission of their local model updates in an adaptive and +predictive fashion, by exploiting radio channel maps. The proposed algorithm +can be tuned to trade model training time for radio resource usage. +Experimental results demonstrate the efficacy of utilizing radio maps. VREM-FL +outperforms literature benchmarks for both a linear regression model (learning +time reduced by 28%) and a deep neural network for a semantic image +segmentation task (doubling the number of model updates within the same time +window). + +
+
+ comment: This work has been submitted to IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Controlgym: Large-Scale Safety-Critical Control Environments for + Benchmarking Reinforcement Learning Algorithms + + +
+ We introduce controlgym, a library of thirty-six safety-critical industrial +control settings, and ten infinite-dimensional partial differential equation +(PDE)-based control problems. Integrated within the OpenAI Gym/Gymnasium (Gym) +framework, controlgym allows direct applications of standard reinforcement +learning (RL) algorithms like stable-baselines3. Our control environments +complement those in Gym with continuous, unbounded action and observation +spaces, motivated by real-world control applications. Moreover, the PDE control +environments uniquely allow the users to extend the state dimensionality of the +system to infinity while preserving the intrinsic dynamics. This feature is +crucial for evaluating the scalability of RL algorithms for control. This +project serves the learning for dynamics & control (L4DC) community, aiming to +explore key questions: the convergence of RL algorithms in learning control +policies; the stability and robustness issues of learning-based controllers; +and the scalability of RL algorithms to high- and potentially +infinite-dimensional systems. We open-source the controlgym project at +https://github.com/xiangyuan-zhang/controlgym. + +
+
+ comment: 25 pages, 16 figures +
+
+
+
+
+ + ☆ Dimension Mixer: A Generalized Method for Structured Sparsity in Deep + Neural Networks + + +
+ The recent success of multiple neural architectures like CNNs, Transformers,
+and MLP-Mixers motivated us to look for similarities and differences between
+them. We found that these architectures can be interpreted through the lens of
+a general concept of dimension mixing. Research on coupling flows and the
+butterfly transform shows that partial and hierarchical signal mixing schemes
+are sufficient for efficient and expressive function approximation. In this
+work, we study group-wise sparse, non-linear, multi-layered and learnable
+mixing schemes of inputs and find that they are complementary to many standard
+neural architectures. Following our observations and drawing inspiration from
+the Fast Fourier Transform, we generalize the Butterfly Structure to use a
+non-linear mixer function, allowing an MLP to serve as the mixing function; we
+call this Butterfly MLP. We also mix along the sequence dimension of
+Transformer-based architectures, which we call Butterfly Attention. Experiments
+on CIFAR and LRA datasets demonstrate that the proposed Non-Linear Butterfly
+Mixers are efficient and scale well when used as mixing functions within the
+host architectures. Additionally, we propose the Patch-Only MLP-Mixer for
+processing spatial 2D signals, demonstrating a different dimension mixing
+strategy.
+
+
+ comment: 11 pages, 4 figures, 7 tables +
+
+
+
+
+ + ☆ Indoor Millimeter Wave Localization using Multiple Self-Supervised Tiny + Neural Networks + + +
+ We consider the localization of a mobile millimeter-wave client in a large
+indoor environment using multilayer perceptron neural networks (NNs). Instead
+of training and deploying a single deep model, we proceed by choosing among
+multiple tiny NNs trained in a self-supervised manner. The main challenge then
+becomes to determine and switch to the best NN among the available ones, as an
+incorrect NN will fail to localize the client. To maintain localization
+accuracy, we propose two switching schemes: one based on a Kalman filter, and
+one based on the statistical distribution of the training data. We analyze the
+proposed schemes via simulations, showing that our approach outperforms both
+geometric localization schemes and the use of a single NN.
+
+
+ comment: 5 pages, 7 figures. Under Review +
+
+
+
+
+ + ☆ Automatic Functional Differentiation in JAX + + +
+ We extend JAX with the capability to automatically differentiate higher-order
+functions (functionals and operators). By representing functions as a
+generalization of arrays, we seamlessly use JAX's existing primitive system to
+implement higher-order functions. We present a set of primitive operators that
+serve as foundational building blocks for constructing several key types of
+functionals. For every introduced primitive operator, we derive and implement
+both linearization and transposition rules, aligning with JAX's internal
+protocols for forward and reverse mode automatic differentiation. This
+enhancement allows for functional differentiation in the same syntax
+traditionally used for functions. The resulting functional gradients are
+themselves functions ready to be invoked in Python. We showcase this tool's
+efficacy and simplicity through applications where functional derivatives are
+indispensable. The source code of this work is released at
+https://github.com/sail-sg/autofd .
+
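+ As a rough, self-contained illustration of why jax.grad extends naturally to
+functionals once functions are represented as arrays, the sketch below
+discretizes a quadratic functional on a grid; the grid representation and the
+functional are illustrative assumptions and do not use the autofd API.
+
+import jax
+import jax.numpy as jnp
+
+# Represent a function by its values on a uniform grid (a crude stand-in for
+# "functions as a generalization of arrays").
+xs = jnp.linspace(0.0, 1.0, 101)
+dx = xs[1] - xs[0]
+f_vals = jnp.sin(xs)
+
+# Discretized functional F[f] = \int_0^1 f(x)^2 dx (Riemann sum).
+def F(f_vals):
+    return jnp.sum(f_vals ** 2) * dx
+
+# jax.grad returns the discretized functional derivative; analytically
+# dF/df(x) = 2 f(x), so this equals 2 * f_vals * dx (quadrature weights).
+dF_df = jax.grad(F)(f_vals)
+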
+
+
+
+
+ + ☆ AI in Pharma for Personalized Sequential Decision-Making: Methods, + Applications and Opportunities + + +
+ In the pharmaceutical industry, the use of artificial intelligence (AI) has +seen consistent growth over the past decade. This rise is attributed to major +advancements in statistical machine learning methodologies, computational +capabilities and the increased availability of large datasets. AI techniques +are applied throughout different stages of drug development, ranging from drug +discovery to post-marketing benefit-risk assessment. Kolluri et al. provided a +review of several case studies that span these stages, featuring key +applications such as protein structure prediction, success probability +estimation, subgroup identification, and AI-assisted clinical trial monitoring. +From a regulatory standpoint, there was a notable uptick in submissions +incorporating AI components in 2021. The most prevalent therapeutic areas +leveraging AI were oncology (27%), psychiatry (15%), gastroenterology (12%), +and neurology (11%). The paradigm of personalized or precision medicine has +gained significant traction in recent research, partly due to advancements in +AI techniques \cite{hamburg2010path}. This shift has had a transformative +impact on the pharmaceutical industry. Departing from the traditional +"one-size-fits-all" model, personalized medicine incorporates various +individual factors, such as environmental conditions, lifestyle choices, and +health histories, to formulate customized treatment plans. By utilizing +sophisticated machine learning algorithms, clinicians and researchers are +better equipped to make informed decisions in areas such as disease prevention, +diagnosis, and treatment selection, thereby optimizing health outcomes for each +individual. + +
+
+
+
+
+ + ☆ Steering Deep Feature Learning with Backward Aligned Feature Updates + + +
+ Deep learning succeeds by doing hierarchical feature learning, yet tuning
+hyper-parameters (HPs) such as initialization scales and learning rates gives
+only indirect control over this behavior. In this paper, we propose the
+alignment between the feature updates and the backward pass as a key notion to
+predict, measure and control feature learning. On the one hand, we show that
+when alignment holds, the magnitude of feature updates after one SGD step is
+related to the magnitude of the forward and backward passes by a simple and
+general formula. This leads to techniques to automatically adjust HPs
+(initialization scales and learning rates) at initialization and throughout
+training to attain a desired feature learning behavior. On the other hand, we
+show that, at random initialization, this alignment is determined by the
+spectrum of a certain kernel, and that well-conditioned layer-to-layer
+Jacobians (aka dynamical isometry) imply alignment. Finally, we investigate
+ReLU MLPs and ResNets in the large width-then-depth limit. Combining hints from
+random matrix theory and numerical experiments, we show that (i) in MLPs with
+i.i.d. initializations, alignment degenerates with depth, making it impossible
+to start training, and that (ii) in ResNets, the branch scale
+$1/\sqrt{\text{depth}}$ is the only one maintaining non-trivial alignment at
+infinite depth.
+
+
+
+
+
+ + ☆ Meta-Prior: Meta learning for Adaptive Inverse Problem Solvers + + +
+ Deep neural networks have become a foundational tool for addressing imaging +inverse problems. They are typically trained for a specific task, with a +supervised loss to learn a mapping from the observations to the image to +recover. However, real-world imaging challenges often lack ground truth data, +rendering traditional supervised approaches ineffective. Moreover, for each new +imaging task, a new model needs to be trained from scratch, wasting time and +resources. To overcome these limitations, we introduce a novel approach based +on meta-learning. Our method trains a meta-model on a diverse set of imaging +tasks that allows the model to be efficiently fine-tuned for specific tasks +with few fine-tuning steps. We show that the proposed method extends to the +unsupervised setting, where no ground truth data is available. In its bilevel +formulation, the outer level uses a supervised loss, that evaluates how well +the fine-tuned model performs, while the inner loss can be either supervised or +unsupervised, relying only on the measurement operator. This allows the +meta-model to leverage a few ground truth samples for each task while being +able to generalize to new imaging tasks. We show that in simple settings, this +approach recovers the Bayes optimal estimator, illustrating the soundness of +our approach. We also demonstrate our method's effectiveness on various tasks, +including image processing and magnetic resonance imaging. + +
+
+
+
+
+ + ☆ Predictable Reinforcement Learning Dynamics through Entropy Rate + Minimization + + +
+ In Reinforcement Learning (RL), agents have no incentive to exhibit
+predictable behaviors, and are often pushed (through e.g. policy entropy
+regularization) to randomize their actions in favor of exploration. From a
+human perspective, this makes RL agents hard to interpret and predict, and from
+a safety perspective, even harder to formally verify. We propose a novel method
+to induce predictable behavior in RL agents, referred to as
+Predictability-Aware RL (PA-RL), which employs the state sequence entropy rate
+as a predictability measure. We show how the entropy rate can be formulated as
+an average reward objective, and since its entropy reward function is
+policy-dependent, we introduce an action-dependent surrogate entropy enabling
+the use of policy gradient (PG) methods. We prove that deterministic policies
+minimizing the average surrogate reward exist and also minimize the actual
+entropy rate, and show how, given a learned dynamical model, we are able to
+approximate the value function associated with the true entropy rate. Finally,
+we demonstrate the effectiveness of the approach in RL tasks inspired by
+human-robot use-cases, and show how it produces agents with more predictable
+behavior while achieving near-optimal rewards.
+
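+ To make the predictability measure concrete, a minimal plug-in estimate of the
+state-sequence entropy rate, assuming a finite Markov state space, is sketched
+below; PA-RL itself folds this quantity into an average-reward objective with a
+policy-dependent surrogate, which is not reproduced here.
+
+import numpy as np
+
+def empirical_entropy_rate(states, n_states):
+    """Plug-in entropy-rate estimate for an observed integer state sequence.
+
+    Estimates a Markov transition matrix and the empirical state distribution,
+    then returns H = -sum_s mu(s) sum_s' P(s'|s) log P(s'|s).
+    """
+    counts = np.zeros((n_states, n_states))
+    for s, s_next in zip(states[:-1], states[1:]):
+        counts[s, s_next] += 1
+    P = counts / np.maximum(counts.sum(axis=1, keepdims=True), 1)
+    mu = np.bincount(states, minlength=n_states) / len(states)
+    logP = np.where(P > 0, np.log(P), 0.0)
+    return float(-np.sum(mu[:, None] * P * logP))
+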
+
+
+
+
+ + ☆ Seg2Reg: Differentiable 2D Segmentation to 1D Regression Rendering for + 360 Room Layout Reconstruction + + +
+ State-of-the-art single-view 360-degree room layout reconstruction methods +formulate the problem as a high-level 1D (per-column) regression task. On the +other hand, traditional low-level 2D layout segmentation is simpler to learn +and can represent occluded regions, but it requires complex post-processing for +the targeting layout polygon and sacrifices accuracy. We present Seg2Reg to +render 1D layout depth regression from the 2D segmentation map in a +differentiable and occlusion-aware way, marrying the merits of both sides. +Specifically, our model predicts floor-plan density for the input +equirectangular 360-degree image. Formulating the 2D layout representation as a +density field enables us to employ `flattened' volume rendering to form 1D +layout depth regression. In addition, we propose a novel 3D warping +augmentation on layout to improve generalization. Finally, we re-implement +recent room layout reconstruction methods into our codebase for benchmarking +and explore modern backbones and training techniques to serve as the strong +baseline. Our model significantly outperforms previous arts. The code will be +made available upon publication. + +
+
+
+
+
+ + ☆ Balancing Summarization and Change Detection in Graph Streams ICDM2023 + + +
+ This study addresses the issue of balancing graph summarization and graph +change detection. Graph summarization compresses large-scale graphs into a +smaller scale. However, the question remains: To what extent should the +original graph be compressed? This problem is solved from the perspective of +graph change detection, aiming to detect statistically significant changes +using a stream of summary graphs. If the compression rate is extremely high, +important changes can be ignored, whereas if the compression rate is extremely +low, false alarms may increase with more memory. This implies that there is a +trade-off between compression rate in graph summarization and accuracy in +change detection. We propose a novel quantitative methodology to balance this +trade-off to simultaneously realize reliable graph summarization and change +detection. We introduce a probabilistic structure of hierarchical latent +variable model into a graph, thereby designing a parameterized summary graph on +the basis of the minimum description length principle. The parameter specifying +the summary graph is then optimized so that the accuracy of change detection is +guaranteed to suppress Type I error probability (probability of raising false +alarms) to be less than a given confidence level. First, we provide a +theoretical framework for connecting graph summarization with change detection. +Then, we empirically demonstrate its effectiveness on synthetic and real +datasets. + +
+
+ comment: 6 pages, Accepted to 23rd IEEE International Conference on Data + Mining (ICDM2023) +
+
+
+
+
+ + ☆ Handling Cost and Constraints with Off-Policy Deep Reinforcement + Learning + + +
+ By reusing data throughout training, off-policy deep reinforcement learning +algorithms offer improved sample efficiency relative to on-policy approaches. +For continuous action spaces, the most popular methods for off-policy learning +include policy improvement steps where a learned state-action ($Q$) value +function is maximized over selected batches of data. These updates are often +paired with regularization to combat associated overestimation of $Q$ values. +With an eye toward safety, we revisit this strategy in environments with +"mixed-sign" reward functions; that is, with reward functions that include +independent positive (incentive) and negative (cost) terms. This setting is +common in real-world applications, and may be addressed with or without +constraints on the cost terms. We find the combination of function +approximation and a term that maximizes $Q$ in the policy update to be +problematic in such environments, because systematic errors in value estimation +impact the contributions from the competing terms asymmetrically. This results +in overemphasis of either incentives or costs and may severely limit learning. +We explore two remedies to this issue. First, consistent with prior work, we +find that periodic resetting of $Q$ and policy networks can be used to reduce +value estimation error and improve learning in this setting. Second, we +formulate novel off-policy actor-critic methods for both unconstrained and +constrained learning that do not explicitly maximize $Q$ in the policy update. +We find that this second approach, when applied to continuous action spaces +with mixed-sign rewards, consistently and significantly outperforms +state-of-the-art methods augmented by resetting. We further find that our +approach produces agents that are both competitive with popular methods overall +and more reliably competent on frequently-studied control problems that do not +have mixed-sign rewards. + +
+
+ comment: 22 pages, 16 figures +
+
+
+
+
+ + ☆ A Comparison Between Invariant and Equivariant Classical and Quantum + Graph Neural Networks + + +
+ Machine learning algorithms are heavily relied on to understand the vast +amounts of data from high-energy particle collisions at the CERN Large Hadron +Collider (LHC). The data from such collision events can naturally be +represented with graph structures. Therefore, deep geometric methods, such as +graph neural networks (GNNs), have been leveraged for various data analysis +tasks in high-energy physics. One typical task is jet tagging, where jets are +viewed as point clouds with distinct features and edge connections between +their constituent particles. The increasing size and complexity of the LHC +particle datasets, as well as the computational models used for their analysis, +greatly motivate the development of alternative fast and efficient +computational paradigms such as quantum computation. In addition, to enhance +the validity and robustness of deep networks, one can leverage the fundamental +symmetries present in the data through the use of invariant inputs and +equivariant layers. In this paper, we perform a fair and comprehensive +comparison between classical graph neural networks (GNNs) and equivariant graph +neural networks (EGNNs) and their quantum counterparts: quantum graph neural +networks (QGNNs) and equivariant quantum graph neural networks (EQGNN). The +four architectures were benchmarked on a binary classification task to classify +the parton-level particle initiating the jet. Based on their AUC scores, the +quantum networks were shown to outperform the classical networks. However, +seeing the computational advantage of the quantum networks in practice may have +to wait for the further development of quantum technology and its associated +APIs. + +
+
+ comment: 14 pages, 7 figures, 3 appendices +
+
+
+
+
+ + ☆ Choosing the parameter of the Fermat distance: navigating geometry and + noise + + +
+ The Fermat distance has been recently established as a useful tool for
+machine learning tasks when a natural distance is not directly available to the
+practitioner, or to improve the results given by Euclidean distances by
+exploiting the geometrical and statistical properties of the dataset. This
+distance depends on a parameter $\alpha$ that greatly impacts the performance
+of subsequent tasks. Ideally, the value of $\alpha$ should be large enough to
+navigate the geometric intricacies inherent to the problem. At the same time,
+it should remain restrained enough to sidestep any deleterious ramifications
+stemming from noise during the process of distance estimation. We study both
+theoretically and through simulations how to select this parameter.
+
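+ For concreteness, one common form of the sample Fermat distance over a dataset
+$Q$ (following prior work on Fermat distances; the exact variant used here may
+differ) is
+
+$$
+D_{Q,\alpha}(x,y) \;=\; \inf_{\substack{(q_1,\dots,q_K)\subset Q \\ q_1=x,\; q_K=y}}
+\;\sum_{k=1}^{K-1} \lVert q_{k+1}-q_k \rVert^{\alpha}, \qquad \alpha \ge 1,
+$$
+
+so that $\alpha = 1$ recovers shortest-path Euclidean behavior, while larger
+$\alpha$ increasingly favors paths through dense regions of the sample.
+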
+
+
+
+
+ + ☆ Targeted Reduction of Causal Models + + +
+ Why does a phenomenon occur? Addressing this question is central to most +scientific inquiries based on empirical observations, and often heavily relies +on simulations of scientific models. As models become more intricate, +deciphering the causes behind these phenomena in high-dimensional spaces of +interconnected variables becomes increasingly challenging. Causal machine +learning may assist scientists in the discovery of relevant and interpretable +patterns of causation in simulations. We introduce Targeted Causal Reduction +(TCR), a method for turning complex models into a concise set of causal factors +that explain a specific target phenomenon. We derive an information theoretic +objective to learn TCR from interventional data or simulations and propose +algorithms to optimize this objective efficiently. TCR's ability to generate +interpretable high-level explanations from complex models is demonstrated on +toy and mechanical systems, illustrating its potential to assist scientists in +the study of complex phenomena in a broad range of disciplines. + +
+
+
+
+
+ + ☆ Contrastive Denoising Score for Text-guided Latent Diffusion Image + Editing + + +
+ With the remarkable advent of text-to-image diffusion models, image editing
+methods have become more diverse and continue to evolve. A promising recent
+approach in this realm is Delta Denoising Score (DDS) - an image editing
+technique based on the Score Distillation Sampling (SDS) framework that
+leverages the rich generative prior of text-to-image diffusion models. However,
+relying solely on the difference between scoring functions is insufficient for
+preserving specific structural elements from the original image, a crucial
+aspect of image editing. Inspired by the similarity and importance differences
+between DDS and contrastive learning for unpaired image-to-image translation
+(CUT), here we present an embarrassingly simple yet very powerful modification
+of DDS, called Contrastive Denoising Score (CDS), for latent diffusion models
+(LDM). Specifically, to enforce structural correspondence between the input and
+output while maintaining the controllability of contents, we introduce a
+straightforward approach to regulate structural consistency using the CUT loss
+within the DDS framework. To calculate this loss, instead of employing
+auxiliary networks, we utilize the intermediate features of the LDM, in
+particular those from the self-attention layers, which possess rich spatial
+information. Our approach enables zero-shot image-to-image translation and
+neural radiance field (NeRF) editing, achieving a well-balanced interplay
+between maintaining the structural details and transforming content.
+Qualitative results and comparisons demonstrate the effectiveness of our
+proposed method. Project page with code is available at
+https://hyelinnam.github.io/CDS/.
+
+
+ comment: Project page: https://hyelinnam.github.io/CDS/ +
+
+
+
+
+ + ☆ Generalisable Agents for Neural Network Optimisation NeurIPS 2023 + + +
+ Optimising deep neural networks is a challenging task due to complex training +dynamics, high computational requirements, and long training times. To address +this difficulty, we propose the framework of Generalisable Agents for Neural +Network Optimisation (GANNO) -- a multi-agent reinforcement learning (MARL) +approach that learns to improve neural network optimisation by dynamically and +responsively scheduling hyperparameters during training. GANNO utilises an +agent per layer that observes localised network dynamics and accordingly takes +actions to adjust these dynamics at a layerwise level to collectively improve +global performance. In this paper, we use GANNO to control the layerwise +learning rate and show that the framework can yield useful and responsive +schedules that are competitive with handcrafted heuristics. Furthermore, GANNO +is shown to perform robustly across a wide variety of unseen initial +conditions, and can successfully generalise to harder problems than it was +trained on. Our work presents an overview of the opportunities that this +paradigm offers for training neural networks, along with key challenges that +remain to be overcome. + +
+
+ comment: Accepted at the Workshop on Advanced Neural Network Training (WANT) + and Optimization for Machine Learning (OPT) at NeurIPS 2023 +
+
+
+
+
+ + ☆ Optimizing ZX-Diagrams with Deep Reinforcement Learning + + +
+ ZX-diagrams are a powerful graphical language for the description of quantum +processes with applications in fundamental quantum mechanics, quantum circuit +optimization, tensor network simulation, and many more. The utility of +ZX-diagrams relies on a set of local transformation rules that can be applied +to them without changing the underlying quantum process they describe. These +rules can be exploited to optimize the structure of ZX-diagrams for a range of +applications. However, finding an optimal sequence of transformation rules is +generally an open problem. In this work, we bring together ZX-diagrams with +reinforcement learning, a machine learning technique designed to discover an +optimal sequence of actions in a decision-making problem and show that a +trained reinforcement learning agent can significantly outperform other +optimization techniques like a greedy strategy or simulated annealing. The use +of graph neural networks to encode the policy of the agent enables +generalization to diagrams much bigger than seen during the training phase. + +
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ☆ Continuous 16-bit Training: Accelerating 32-bit Pre-Trained Neural + Networks + + +
+ In the field of deep learning, the prevalence of models initially trained +with 32-bit precision is a testament to its robustness and accuracy. However, +the continuous evolution of these models often demands further training, which +can be resource-intensive. This study introduces a novel approach where we +continue the training of these pre-existing 32-bit models using 16-bit +precision. This technique not only caters to the need for efficiency in +computational resources but also significantly improves the speed of additional +training phases. By adopting 16-bit precision for ongoing training, we are able +to substantially decrease memory requirements and computational burden, thereby +accelerating the training process in a resource-limited setting. Our +experiments show that this method maintains the high standards of accuracy set +by the original 32-bit training while providing a much-needed boost in training +speed. This approach is especially pertinent in today's context, where most +models are initially trained in 32-bit and require periodic updates and +refinements. The findings from our research suggest that this strategy of +16-bit continuation training can be a key solution for sustainable and +efficient deep learning, offering a practical way to enhance pre-trained models +rapidly and in a resource-conscious manner. + +
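+ A minimal PyTorch sketch of the continuation idea, assuming a generic
+architecture and a hypothetical fp32 checkpoint path (the exact recipe, e.g.
+any loss scaling, is not specified in the abstract):
+
+import torch
+import torch.nn as nn
+
+model = nn.Sequential(nn.Linear(784, 256), nn.ReLU(), nn.Linear(256, 10))
+model.load_state_dict(torch.load("pretrained_fp32.pt"))  # hypothetical path
+
+# Continue training entirely in 16-bit precision, roughly halving the memory
+# needed for weights and activations.
+model = model.half().cuda()
+optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
+
+loader = [(torch.randn(32, 784), torch.randint(0, 10, (32,)))]  # stand-in data
+for x, y in loader:
+    x, y = x.half().cuda(), y.cuda()
+    loss = nn.functional.cross_entropy(model(x), y)
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+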
+
+
+
+
+ + ☆ Communication-Efficient Heterogeneous Federated Learning with + Generalized Heavy-Ball Momentum + + +
+ Federated Learning (FL) is the state-of-the-art approach for learning from
+decentralized data in privacy-constrained scenarios. As the current literature
+reports, the main problems associated with FL are system and statistical
+challenges: the former demand efficient learning from edge devices, including
+lowering communication bandwidth and frequency, while the latter require
+algorithms robust to non-iidness. State-of-the-art approaches either guarantee
+convergence at increased communication cost or are not sufficiently robust to
+handle extremely heterogeneous local distributions. In this work we propose a
+novel generalization of the heavy-ball momentum, and present FedHBM to
+effectively address statistical heterogeneity in FL without introducing any
+communication overhead. We conduct extensive experimentation on common FL
+vision and NLP datasets, showing that our FedHBM algorithm empirically yields
+better model quality and higher convergence speed w.r.t. the state of the art,
+especially in pathological non-iid scenarios. While being designed for
+cross-silo settings, we show how FedHBM is applicable in moderate-to-high
+cross-device scenarios, and how good model initializations (e.g. pre-training)
+can be exploited for prompt acceleration. Extended experimentation on
+large-scale real-world federated datasets further corroborates the
+effectiveness of our approach for real-world FL applications.
+
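+ For reference, the classical heavy-ball update that FedHBM generalizes reads
+
+$$ w_{t+1} \;=\; w_t \;-\; \eta\,\nabla f(w_t) \;+\; \beta\,(w_t - w_{t-1}), $$
+
+with learning rate $\eta$ and momentum coefficient $\beta$; the federated,
+generalized form used by FedHBM is not spelled out in the abstract.
+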
+
+
+
+
+ + ☆ Class Distribution Shifts in Zero-Shot Learning: Learning Robust + Representations + + +
+ Distribution shifts between training and deployment data often affect the +performance of machine learning models. In this paper, we explore a setting +where a hidden variable induces a shift in the distribution of classes. These +distribution shifts are particularly challenging for zero-shot classifiers, as +they rely on representations learned from training classes, but are deployed on +new, unseen ones. We introduce an algorithm to learn data representations that +are robust to such class distribution shifts in zero-shot verification tasks. +We show that our approach, which combines hierarchical data sampling with +out-of-distribution generalization techniques, improves generalization to +diverse class distributions in both simulations and real-world datasets. + +
+
+
+
+
+ + ☆ Multi-scale Iterative Refinement towards Robust and Versatile Molecular + Docking + + +
+ Molecular docking is a key computational tool utilized to predict the binding +conformations of small molecules to protein targets, which is fundamental in +the design of novel drugs. Despite recent advancements in geometric deep +learning-based approaches leading to improvements in blind docking efficiency, +these methods have encountered notable challenges, such as limited +generalization performance on unseen proteins, the inability to concurrently +address the settings of blind docking and site-specific docking, and the +frequent occurrence of physical implausibilities such as inter-molecular steric +clash. In this study, we introduce DeltaDock, a robust and versatile framework +designed for efficient molecular docking to overcome these challenges. +DeltaDock operates in a two-step process: rapid initial complex structures +sampling followed by multi-scale iterative refinement of the initial +structures. In the initial stage, to sample accurate structures with high +efficiency, we develop a ligand-dependent binding site prediction model founded +on large protein models and graph neural networks. This model is then paired +with GPU-accelerated sampling algorithms. The sampled structures are updated +using a multi-scale iterative refinement module that captures both +protein-ligand atom-atom interactions and residue-atom interactions in the +following stage. Distinct from previous geometric deep learning methods that +are conditioned on the blind docking setting, DeltaDock demonstrates superior +performance in both blind docking and site-specific docking settings. +Comprehensive experimental results reveal that DeltaDock consistently surpasses +baseline methods in terms of docking accuracy. Furthermore, it displays +remarkable generalization capabilities and proficiency for predicting +physically valid structures, thereby attesting to its robustness and +reliability in various scenarios. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ☆ FediOS: Decoupling Orthogonal Subspaces for Personalization in + Feature-skew Federated Learning + + +
+ Personalized federated learning (pFL) enables collaborative training among +multiple clients to enhance the capability of customized local models. In pFL, +clients may have heterogeneous (also known as non-IID) data, which poses a key +challenge in how to decouple the data knowledge into generic knowledge for +global sharing and personalized knowledge for preserving local personalization. +A typical way of pFL focuses on label distribution skew, and they adopt a +decoupling scheme where the model is split into a common feature extractor and +two prediction heads (generic and personalized). However, such a decoupling +scheme cannot solve the essential problem of feature skew heterogeneity, +because a common feature extractor cannot decouple the generic and personalized +features. Therefore, in this paper, we rethink the architecture decoupling +design for feature-skew pFL and propose an effective pFL method called FediOS. +In FediOS, we reformulate the decoupling into two feature extractors (generic +and personalized) and one shared prediction head. Orthogonal projections are +used for clients to map the generic features into one common subspace and +scatter the personalized features into different subspaces to achieve +decoupling for them. In addition, a shared prediction head is trained to +balance the importance of generic and personalized features during inference. +Extensive experiments on four vision datasets demonstrate our method reaches +state-of-the-art pFL performances under feature skew heterogeneity. + +
+
+
+
+
+ + ☆ Learning Radio Environments by Differentiable Ray Tracing + + +
+ Ray tracing (RT) is instrumental in 6G research in order to generate +spatially-consistent and environment-specific channel impulse responses (CIRs). +While acquiring accurate scene geometries is now relatively straightforward, +determining material characteristics requires precise calibration using channel +measurements. We therefore introduce a novel gradient-based calibration method, +complemented by differentiable parametrizations of material properties, +scattering and antenna patterns. Our method seamlessly integrates with +differentiable ray tracers that enable the computation of derivatives of CIRs +with respect to these parameters. Essentially, we approach field computation as +a large computational graph wherein parameters are trainable akin to weights of +a neural network (NN). We have validated our method using both synthetic data +and real-world indoor channel measurements, employing a distributed +multiple-input multiple-output (MIMO) channel sounder. + +
+
+ comment: 11 pages, 9 figures +
+
+
+
+
+ + ☆ Can semi-supervised learning use all the data effectively? A lower bound + perspective + + +
+ Prior works have shown that semi-supervised learning algorithms can leverage +unlabeled data to improve over the labeled sample complexity of supervised +learning (SL) algorithms. However, existing theoretical analyses focus on +regimes where the unlabeled data is sufficient to learn a good decision +boundary using unsupervised learning (UL) alone. This begs the question: Can +SSL algorithms simultaneously improve upon both UL and SL? To this end, we +derive a tight lower bound for 2-Gaussian mixture models that explicitly +depends on the labeled and the unlabeled dataset size as well as the +signal-to-noise ratio of the mixture distribution. Surprisingly, our result +implies that no SSL algorithm can improve upon the minimax-optimal statistical +error rates of SL or UL algorithms for these distributions. Nevertheless, we +show empirically on real-world data that SSL algorithms can still outperform UL +and SL methods. Therefore, our work suggests that, while proving performance +gains for SSL algorithms is possible, it requires careful tracking of +constants. + +
+
+ comment: Published in Advances in Neural Information Processing Systems 2023 +
+
+
+
+
+ + ☆ Heterogeneous Graph-based Trajectory Prediction using Local Map Context + and Social Interactions SC 2023 + + +
+ Precisely predicting the future trajectories of surrounding traffic
+participants is a crucial but challenging problem in autonomous driving, due to
+complex interactions between traffic agents, map context and traffic rules.
+Vector-based approaches have recently been shown to achieve some of the best
+performance on trajectory prediction benchmarks. These methods model simple
+interactions between traffic agents but do not distinguish between relation
+types or attributes such as their distance along the road. Furthermore, they
+represent lanes only by sequences of vectors representing center lines and
+ignore context information like lane dividers and other road elements. We
+present a novel approach for vector-based trajectory prediction that addresses
+these shortcomings by leveraging three crucial sources of information: First,
+we model interactions between traffic agents by a semantic scene graph that
+accounts for the nature and important features of their relation. Second, we
+extract agent-centric image-based map features to model the local map context.
+Finally, we generate anchor paths to restrict the policy in multi-modal
+prediction to permitted trajectories only. Each of these three enhancements
+shows advantages over the baseline model HoliGraph.
+
+
+ comment: Accepted on IEEE ITSC 2023 +
+
+
+
+
+ + ☆ Real-Time Vibration-Based Bearing Fault Diagnosis Under Time-Varying + Speed Conditions + + +
+ Detection of rolling-element bearing faults is crucial for implementing
+proactive maintenance strategies and for minimizing the economic and
+operational consequences of unexpected failures. However, many existing
+techniques are developed and tested under strictly controlled conditions,
+limiting their adaptability to the diverse and dynamic settings encountered in
+practical applications. This paper presents an efficient real-time
+convolutional neural network (CNN) for diagnosing multiple bearing faults under
+various noise levels and time-varying rotational speeds. Additionally, we
+propose a novel Fisher-based spectral separability analysis (SSA) method to
+elucidate the effectiveness of the designed CNN model. We conducted experiments
+on both healthy bearings and bearings afflicted with inner race, outer race,
+and roller ball faults. The experimental results show the superiority of our
+model over the current state-of-the-art approach in three ways: it achieves
+substantial accuracy gains of up to 15.8%, it is robust to noise with high
+performance across various signal-to-noise ratios, and it runs in real time
+with processing times five times shorter than the acquisition time.
+Additionally, by using the proposed SSA technique, we offer insights into the
+model's performance and underscore its effectiveness in tackling real-world
+challenges.
+
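+ The SSA method itself is not detailed in the abstract; a generic per-frequency
+Fisher-criterion sketch of the underlying idea (between-class over within-class
+variance of the spectra) is shown below.
+
+import numpy as np
+
+def fisher_spectral_separability(spectra, labels):
+    """Per-frequency-bin Fisher ratio: between-class / within-class variance.
+
+    spectra: (n_samples, n_bins) magnitude spectra; labels: (n_samples,) ids.
+    """
+    classes = np.unique(labels)
+    grand_mean = spectra.mean(axis=0)
+    between = np.zeros(spectra.shape[1])
+    within = np.zeros(spectra.shape[1])
+    for c in classes:
+        s_c = spectra[labels == c]
+        between += len(s_c) * (s_c.mean(axis=0) - grand_mean) ** 2
+        within += ((s_c - s_c.mean(axis=0)) ** 2).sum(axis=0)
+    return between / np.maximum(within, 1e-12)
+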
+
+
+
+
+ + ☆ Match me if you can: Semantic Correspondence Learning with Unpaired + Images + + +
+ Recent approaches for semantic correspondence have focused on obtaining +high-quality correspondences using a complicated network, refining the +ambiguous or noisy matching points. Despite their performance improvements, +they remain constrained by the limited training pairs due to costly point-level +annotations. This paper proposes a simple yet effective method that performs +training with unlabeled pairs to complement both limited image pairs and sparse +point pairs, requiring neither extra labeled keypoints nor trainable modules. +We fundamentally extend the data quantity and variety by augmenting new +unannotated pairs not primitively provided as training pairs in benchmarks. +Using a simple teacher-student framework, we offer reliable pseudo +correspondences to the student network via machine supervision. Finally, the +performance of our network is steadily improved by the proposed iterative +training, putting back the student as a teacher to generate refined labels and +train a new student repeatedly. Our models outperform the milestone baselines, +including state-of-the-art methods on semantic correspondence benchmarks. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Dataset Distillation via the Wasserstein Metric + + +
+ Dataset distillation (DD) offers a compelling approach in computer vision, +with the goal of condensing extensive datasets into smaller synthetic versions +without sacrificing much of the model performance. In this paper, we continue +to study the methods for DD, by addressing its conceptually core objective: how +to capture the essential representation of extensive datasets in smaller, +synthetic forms. + We propose a novel approach utilizing the Wasserstein distance, a metric +rooted in optimal transport theory, to enhance distribution matching in DD. Our +method leverages the Wasserstein barycenter, offering a geometrically +meaningful way to quantify distribution differences and effectively capture the +centroid of a set of distributions. Our approach retains the computational +benefits of distribution matching-based methods while achieving new +state-of-the-art performance on several benchmarks. + To provide useful prior for learning the images, we embed the synthetic data +into the feature space of pretrained classification models to conduct +distribution matching. Extensive testing on various high-resolution datasets +confirms the effectiveness and adaptability of our method, indicating the +promising yet unexplored capabilities of Wasserstein metrics in dataset +distillation. + +
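+ The method above matches distributions in a pretrained feature space and uses
+Wasserstein barycenters; as a minimal illustration of the metric itself, the
+empirical 1-D Wasserstein-1 distance between equal-size samples reduces to a
+sorted mean absolute difference:
+
+import numpy as np
+
+def wasserstein_1d(a, b):
+    # In one dimension with equal sample sizes, optimal transport pairs
+    # sorted samples, so W1 is a mean absolute difference of order statistics.
+    a, b = np.sort(np.asarray(a)), np.sort(np.asarray(b))
+    return float(np.abs(a - b).mean())
+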
+
+ comment: 8 pages, 10 figures +
+
+
+
+
+ + ☆ HOT: Higher-Order Dynamic Graph Representation Learning with Efficient + Transformers + + +
+ Many graph representation learning (GRL) problems are dynamic, with millions +of edges added or removed per second. A fundamental workload in this setting is +dynamic link prediction: using a history of graph updates to predict whether a +given pair of vertices will become connected. Recent schemes for link +prediction in such dynamic settings employ Transformers, modeling individual +graph updates as single tokens. In this work, we propose HOT: a model that +enhances this line of works by harnessing higher-order (HO) graph structures; +specifically, k-hop neighbors and more general subgraphs containing a given +pair of vertices. Harnessing such HO structures by encoding them into the +attention matrix of the underlying Transformer results in higher accuracy of +link prediction outcomes, but at the expense of increased memory pressure. To +alleviate this, we resort to a recent class of schemes that impose hierarchy on +the attention matrix, significantly reducing memory footprint. The final design +offers a sweetspot between high accuracy and low memory utilization. HOT +outperforms other dynamic GRL schemes, for example achieving 9%, 7%, and 15% +higher accuracy than - respectively - DyGFormer, TGN, and GraphMixer, for the +MOOC dataset. Our design can be seamlessly extended towards other dynamic GRL +workloads. + +
+
+
+
+
+ + ☆ Detecting Anomalous Network Communication Patterns Using Graph + Convolutional Networks + + +
+ To protect an organization's endpoints from sophisticated cyberattacks,
+advanced detection methods are required. In this research, we present
+GCNetOmaly: a graph convolutional network (GCN)-based variational autoencoder
+(VAE) anomaly detector trained on data that include connection events among
+internal and external machines. As input, the proposed GCN-based VAE model
+receives two matrices: (i) the normalized adjacency matrix, which represents
+the connections among the machines, and (ii) the feature matrix, which includes
+various features (demographic, statistical, process-related, and Node2vec
+structural features) that are used to profile the individual nodes/machines.
+After training the model on data collected for a predefined time window, the
+model is applied to the same data; the reconstruction score obtained by the
+model for a given machine then serves as the machine's anomaly score.
+GCNetOmaly was evaluated on real, large-scale data logged by Carbon Black EDR
+from a large financial organization's automated teller machines (ATMs) as well
+as communication with Active Directory (AD) servers in two setups: unsupervised
+and supervised. The results of our evaluation demonstrate GCNetOmaly's
+effectiveness in detecting anomalous behavior of machines on unsupervised data.
+
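+ The abstract does not specify the normalization; a standard choice for GCN
+inputs is the symmetric normalization with self-loops, sketched below.
+
+import numpy as np
+
+def normalized_adjacency(A):
+    # \hat{A} = D^{-1/2} (A + I) D^{-1/2}, the usual GCN input normalization.
+    A_hat = A + np.eye(A.shape[0])
+    d = A_hat.sum(axis=1)
+    D_inv_sqrt = np.diag(1.0 / np.sqrt(d))
+    return D_inv_sqrt @ A_hat @ D_inv_sqrt
+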
+
+
+
+
+ + ☆ Combining deep generative models with extreme value theory for synthetic + hazard simulation: a multivariate and spatially coherent approach NeurIPS 2023 + + +
+ Climate hazards can cause major disasters when they occur simultaneously as +compound hazards. To understand the distribution of climate risk and inform +adaptation policies, scientists need to simulate a large number of physically +realistic and spatially coherent events. Current methods are limited by +computational constraints and the probabilistic spatial distribution of +compound events is not given sufficient attention. The bottleneck in current +approaches lies in modelling the dependence structure between variables, as +inference on parametric models suffers from the curse of dimensionality. +Generative adversarial networks (GANs) are well-suited to such a problem due to +their ability to implicitly learn the distribution of data in high-dimensional +settings. We employ a GAN to model the dependence structure for daily maximum +wind speed, significant wave height, and total precipitation over the Bay of +Bengal, combining this with traditional extreme value theory for controlled +extrapolation of the tails. Once trained, the model can be used to efficiently +generate thousands of realistic compound hazard events, which can inform +climate risk assessments for climate adaptation and disaster preparedness. The +method developed is flexible and transferable to other multivariate and spatial +climate datasets. + +
+
+ comment: Accepted at NeurIPS 2023 Workshop: Tackling Climate Change with + Machine Learning (CCAI) +
+
+
+
+
+ + ☆ Calibration-free online test-time adaptation for electroencephalography + motor imagery decoding + + +
+ Providing a promising pathway to link the human brain with external devices, +Brain-Computer Interfaces (BCIs) have seen notable advancements in decoding +capabilities, primarily driven by increasingly sophisticated techniques, +especially deep learning. However, achieving high accuracy in real-world +scenarios remains a challenge due to the distribution shift between sessions +and subjects. In this paper we will explore the concept of online test-time +adaptation (OTTA) to continuously adapt the model in an unsupervised fashion +during inference time. Our approach guarantees the preservation of privacy by +eliminating the requirement to access the source data during the adaptation +process. Additionally, OTTA achieves calibration-free operation by not +requiring any session- or subject-specific data. We will investigate the task +of electroencephalography (EEG) motor imagery decoding using a lightweight +architecture together with different OTTA techniques like alignment, adaptive +batch normalization, and entropy minimization. We examine two datasets and +three distinct data settings for a comprehensive analysis. Our adaptation +methods produce state-of-the-art results, potentially instigating a shift in +transfer learning for BCI decoding towards online adaptation. + +
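+ A minimal sketch of one of the OTTA ingredients mentioned above, entropy
+minimization on an unlabeled test batch; in practice usually only
+normalization-layer parameters are adapted, which is omitted here for brevity.
+
+import torch
+import torch.nn.functional as F
+
+def entropy_minimization_step(model, x, optimizer):
+    # One online adaptation step: minimize the mean prediction entropy on the
+    # incoming unlabeled batch, then return the (pre-update) predictions.
+    logits = model(x)
+    probs = F.softmax(logits, dim=-1)
+    entropy = -(probs * torch.log(probs + 1e-8)).sum(dim=-1).mean()
+    optimizer.zero_grad()
+    entropy.backward()
+    optimizer.step()
+    return logits.detach()
+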
+
+ comment: 6 pages, 4 figures, submitted to: 12th International Winter + Conference on Brain-Computer Interface 2024 +
+
+
+
+
+ + ☆ Revisiting Proposal-based Object Detection + + +
+ This paper revisits the pipeline for detecting objects in images with +proposals. For any object detector, the obtained box proposals or queries need +to be classified and regressed towards ground truth boxes. The common solution +for the final predictions is to directly maximize the overlap between each +proposal and the ground truth box, followed by a winner-takes-all ranking or +non-maximum suppression. In this work, we propose a simple yet effective +alternative. For proposal regression, we solve a simpler problem where we +regress to the area of intersection between proposal and ground truth. In this +way, each proposal only specifies which part contains the object, avoiding a +blind inpainting problem where proposals need to be regressed beyond their +visual scope. In turn, we replace the winner-takes-all strategy and obtain the +final prediction by taking the union over the regressed intersections of a +proposal group surrounding an object. Our revisited approach comes with minimal +changes to the detection pipeline and can be plugged into any existing method. +We show that our approach directly improves canonical object detection and +instance segmentation architectures, highlighting the utility of +intersection-based regression and grouping. + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ☆ Global Convergence of Online Identification for Mixed Linear Regression + + +
+ Mixed linear regression (MLR) is a powerful model for characterizing +nonlinear relationships by utilizing a mixture of linear regression sub-models. +The identification of MLR is a fundamental problem, where most of the existing +results focus on offline algorithms, rely on independent and identically +distributed (i.i.d) data assumptions, and provide local convergence results +only. This paper investigates the online identification and data clustering +problems for two basic classes of MLRs, by introducing two corresponding new +online identification algorithms based on the expectation-maximization (EM) +principle. It is shown that both algorithms will converge globally without +resorting to the traditional i.i.d data assumptions. The main challenge in our +investigation lies in the fact that the gradient of the maximum likelihood +function does not have a unique zero, and a key step in our analysis is to +establish the stability of the corresponding differential equation in order to +apply the celebrated Ljung's ODE method. It is also shown that the +within-cluster error and the probability that the new data is categorized into +the correct cluster are asymptotically the same as those in the case of known +parameters. Finally, numerical simulations are provided to verify the +effectiveness of our online algorithms. + +
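+ A toy stochastic-EM sketch for a two-component MLR conveys the flavor of the
+online identification setting; this is a generic illustration, not the paper's
+algorithms or their global-convergence construction.
+
+import numpy as np
+
+def online_em_mlr(stream, dim, lr=0.05, noise_std=1.0):
+    """Online EM for y = theta_k^T x + noise with two unknown sub-models.
+
+    E-step: posterior responsibility of each sub-model for the new sample;
+    M-step: responsibility-weighted stochastic gradient update.
+    """
+    theta = [np.zeros(dim), np.ones(dim)]
+    for x, y in stream:
+        lik = np.array([np.exp(-(y - th @ x) ** 2 / (2 * noise_std ** 2))
+                        for th in theta])
+        resp = lik / max(lik.sum(), 1e-12)
+        for k in range(2):
+            theta[k] = theta[k] + lr * resp[k] * (y - theta[k] @ x) * x
+    return theta
+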
+
+
+
+
+ + ☆ Data-Agnostic Model Poisoning against Federated Learning: A Graph + Autoencoder Approach + + +
+ This paper proposes a novel, data-agnostic, model poisoning attack on +Federated Learning (FL), by designing a new adversarial graph autoencoder +(GAE)-based framework. The attack requires no knowledge of FL training data and +achieves both effectiveness and undetectability. By listening to the benign +local models and the global model, the attacker extracts the graph structural +correlations among the benign local models and the training data features +substantiating the models. The attacker then adversarially regenerates the +graph structural correlations while maximizing the FL training loss, and +subsequently generates malicious local models using the adversarial graph +structure and the training data features of the benign ones. A new algorithm is +designed to iteratively train the malicious local models using GAE and +sub-gradient descent. The convergence of FL under attack is rigorously proved, +with a considerably large optimality gap. Experiments show that the FL accuracy +drops gradually under the proposed attack and existing defense mechanisms fail +to detect it. The attack can give rise to an infection across all benign +devices, making it a serious threat to FL. + +
+
+ comment: 15 pages, 10 figures, submitted to IEEE Transactions on Information + Forensics and Security (TIFS) +
+
+
+
+
+ + ☆ Improving Adversarial Transferability via Model Alignment + + +
+ Neural networks are susceptible to adversarial perturbations that are
+transferable across different models. In this paper, we introduce a novel model
+alignment technique aimed at improving a given source model's ability to
+generate transferable adversarial perturbations. During the alignment process,
+the parameters of the source model are fine-tuned to minimize an alignment
+loss. This loss measures the divergence in the predictions between the source
+model and another, independently trained model, referred to as the witness
+model. To understand the effect of model alignment, we conduct a geometric
+analysis of the resulting changes in the loss landscape. Extensive experiments
+on the ImageNet dataset, using a variety of model architectures, demonstrate
+that perturbations generated from aligned source models exhibit significantly
+higher transferability than those from the original source model.
+
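+ One natural instantiation of the alignment loss is a KL divergence between the
+two models' predictive distributions; the exact divergence used is not given in
+the abstract.
+
+import torch
+import torch.nn.functional as F
+
+def alignment_loss(source_logits, witness_logits):
+    # Divergence between source-model predictions and those of the witness
+    # model; the source model is fine-tuned to minimize this quantity.
+    return F.kl_div(F.log_softmax(source_logits, dim=-1),
+                    F.softmax(witness_logits, dim=-1),
+                    reduction="batchmean")
+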
+
+
+
+
+ + ☆ ZeST-NeRF: Using temporal aggregation for Zero-Shot Temporal NeRFs BMVC 2023 + + +
+ In the field of media production, video editing techniques play a pivotal +role. Recent approaches have had great success at performing novel view image +synthesis of static scenes. But adding temporal information adds an extra layer +of complexity. Previous models have focused on implicitly representing static +and dynamic scenes using NeRF. These models achieve impressive results but are +costly at training and inference time. They overfit an MLP to describe the +scene implicitly as a function of position. This paper proposes ZeST-NeRF, a +new approach that can produce temporal NeRFs for new scenes without retraining. +We can accurately reconstruct novel views using multi-view synthesis techniques +and scene flow-field estimation, trained only with unrelated scenes. We +demonstrate how existing state-of-the-art approaches from a range of fields +cannot adequately solve this new task and demonstrate the efficacy of our +solution. The resulting network improves quantitatively by 15% and produces +significantly better visual results. + +
+
+ comment: VUA BMVC 2023 +
+
+
+
+
+ + ☆ Causal Fairness under Unobserved Confounding: A Neural Sensitivity + Framework + + +
+ Fairness for machine learning predictions is widely required in practice for +legal, ethical, and societal reasons. Existing work typically focuses on +settings without unobserved confounding, even though unobserved confounding can +lead to severe violations of causal fairness and, thus, unfair predictions. In +this work, we analyze the sensitivity of causal fairness to unobserved +confounding. Our contributions are three-fold. First, we derive bounds for +causal fairness metrics under different sources of unobserved confounding. This +enables practitioners to examine the sensitivity of their machine learning +models to unobserved confounding in fairness-critical applications. Second, we +propose a novel neural framework for learning fair predictions, which allows us +to offer worst-case guarantees of the extent to which causal fairness can be +violated due to unobserved confounding. Third, we demonstrate the effectiveness +of our framework in a series of experiments, including a real-world case study +about predicting prison sentences. To the best of our knowledge, ours is the +first work to study causal fairness under unobserved confounding. To this end, +our work is of direct practical value as a refutation strategy to ensure the +fairness of predictions in high-stakes applications. + +
+
+
+
+
+ + ☆ How Much Is Hidden in the NAS Benchmarks? Few-Shot Adaptation of a NAS + Predictor + + +
+ Neural architecture search has proven to be a powerful approach to designing +and refining neural networks, often boosting their performance and efficiency +over manually-designed variations, but comes with computational overhead. While +there has been a considerable amount of research focused on lowering the cost +of NAS for mainstream tasks, such as image classification, a lot of those +improvements stem from the fact that those tasks are well-studied in the +broader context. Consequently, applicability of NAS to emerging and +under-represented domains is still associated with a relatively high cost +and/or uncertainty about the achievable gains. To address this issue, we turn +our focus towards the recent growth of publicly available NAS benchmarks in an +attempt to extract general NAS knowledge, transferable across different tasks +and search spaces. We borrow from the rich field of meta-learning for few-shot +adaptation and carefully study applicability of those methods to NAS, with a +special focus on the relationship between task-level correlation (domain shift) +and predictor transferability; which we deem critical for improving NAS on +diverse tasks. In our experiments, we use 6 NAS benchmarks in conjunction, +spanning in total 16 NAS settings -- our meta-learning approach not only shows +superior (or matching) performance in the cross-validation experiments but also +successful extrapolation to a new search space and tasks. + +
+
+
+
+
+ + ☆ The Sliding Regret in Stochastic Bandits: Discriminating Index and + Randomized Policies + + +
+ This paper studies the one-shot behavior of no-regret algorithms for +stochastic bandits. Although many algorithms are known to be asymptotically +optimal with respect to the expected regret, over a single run, their +pseudo-regret seems to follow one of two tendencies: it is either smooth or +bumpy. To measure this tendency, we introduce a new notion: the sliding regret, +that measures the worst pseudo-regret over a time-window of fixed length +sliding to infinity. We show that randomized methods (e.g. Thompson Sampling +and MED) have optimal sliding regret, while index policies, although possibly +asymptotically optimal for the expected regret, have the worst possible sliding +regret under regularity conditions on their index (e.g. UCB, UCB-V, KL-UCB, +MOSS, IMED etc.). We further analyze the average bumpiness of the pseudo-regret +of index policies via the regret of exploration, that we show to be suboptimal +as well. + +
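+ A finite-horizon version of the quantity described (the paper lets the window
+slide to infinity) can be computed directly from a single run's cumulative
+pseudo-regret:
+
+import numpy as np
+
+def sliding_regret(pseudo_regret, window):
+    # Worst pseudo-regret increase over any time window of fixed length,
+    # given the cumulative pseudo-regret R_1, ..., R_T of one run.
+    R = np.asarray(pseudo_regret, dtype=float)
+    return float((R[window:] - R[:-window]).max())
+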
+
+ comment: 31 pages +
+
+
+
+
+ + ☆ Exploring the Temperature-Dependent Phase Transition in Modern Hopfield + Networks NeurIPS23 + + +
+ The recent discovery of a connection between Transformers and Modern Hopfield +Networks (MHNs) has reignited the study of neural networks from a physical +energy-based perspective. This paper focuses on the pivotal effect of the +inverse temperature hyperparameter $\beta$ on the distribution of energy minima +of the MHN. To achieve this, the distribution of energy minima is tracked in a +simplified MHN in which equidistant normalised patterns are stored. This +network demonstrates a phase transition at a critical temperature +$\beta_{\text{c}}$, from a single global attractor towards highly pattern +specific minima as $\beta$ is increased. Importantly, the dynamics are not +solely governed by the hyperparameter $\beta$ but are instead determined by an +effective inverse temperature $\beta_{\text{eff}}$ which also depends on the +distribution and size of the stored patterns. Recognizing the role of +hyperparameters in the MHN could, in the future, aid researchers in the domain +of Transformers to optimise their initial choices, potentially reducing the +necessity for time and energy expensive hyperparameter fine-tuning. + +
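+ A minimal sketch of the usual Modern Hopfield retrieval dynamics makes the role
+of $\beta$ concrete; the effective $\beta_{\text{eff}}$ discussed above, which
+also depends on the stored patterns, is not modeled here.
+
+import numpy as np
+
+def mhn_retrieve(X, xi, beta, steps=5):
+    # Iterate xi <- X softmax(beta X^T xi), with stored patterns as columns
+    # of X. Large beta sharpens retrieval towards a single pattern-specific
+    # minimum; small beta yields a near-uniform mixture (one global attractor).
+    for _ in range(steps):
+        a = beta * (X.T @ xi)
+        p = np.exp(a - a.max())
+        xi = X @ (p / p.sum())
+    return xi
+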
+
+ comment: Accepted as poster for Associative Memory and Hopfield Networks + workshop at NeurIPS23 +
+
+
+
+
+ + ☆ On the convergence of adaptive first order methods: proximal gradient + and alternating minimization algorithms + + +
+ Building upon recent works on linesearch-free adaptive proximal gradient +methods, this paper proposes AdaPG$^{\pi,r}$, a framework that unifies and +extends existing results by providing larger stepsize policies and improved +lower bounds. Different choices of the parameters $\pi$ and $r$ are discussed +and the efficacy of the resulting methods is demonstrated through numerical +simulations. In an attempt to better understand the underlying theory, its +convergence is established in a more general setting that allows for +time-varying parameters. Finally, an adaptive alternating minimization +algorithm is presented by exploring the dual setting. This algorithm not only +incorporates additional adaptivity, but also expands its applicability beyond +standard strongly convex settings. + +
+
+
+
+
+ + ☆ Convergence Analysis of Fractional Gradient Descent + + +
+ Fractional derivatives are a well-studied generalization of integer order
+derivatives. Naturally, for optimization, it is of interest to understand the
+convergence properties of gradient descent using fractional derivatives.
+Convergence analysis of fractional gradient descent is currently limited in
+both the methods and the settings analyzed. This paper aims to fill in
+these gaps by analyzing variations of fractional gradient descent in smooth and
+convex, smooth and strongly convex, and smooth and non-convex settings. First,
+novel bounds will be established bridging fractional and integer derivatives.
+Then, these bounds will be applied to the aforementioned settings to prove
+$O(1/T)$ convergence for smooth and convex functions and linear convergence for
+smooth and strongly convex functions. Additionally, we prove $O(1/T)$
+convergence for smooth and non-convex functions using an extended notion of
+smoothness that is more natural for fractional derivatives. Finally, empirical
+results will be presented on the potential speed-up of fractional gradient
+descent over standard gradient descent as well as the challenges of predicting
+which will be faster in general.
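+ As a concrete illustration of the kind of method analyzed above, the sketch
+below implements a common fractional-gradient-descent variant in which the
+Caputo derivative of order $\alpha \in (0,1)$ is truncated to its first term;
+the paper's exact update rule may differ, and the function names are
+illustrative only.
+
+ import numpy as np
+ from math import gamma
+
+ def caputo_fractional_gradient(grad, x, c, alpha):
+     # First-order truncation of the coordinate-wise Caputo fractional
+     # gradient of order alpha with terminal point c (a common approximation
+     # in the fractional-GD literature, not necessarily the paper's variant).
+     return grad(x) * np.abs(x - c) ** (1.0 - alpha) / gamma(2.0 - alpha)
+
+ def fractional_gradient_descent(grad, x0, c, alpha=0.8, lr=0.1, steps=200):
+     x = np.asarray(x0, dtype=float)
+     for _ in range(steps):
+         x = x - lr * caputo_fractional_gradient(grad, x, c, alpha)
+     return x
+
+ # Toy example: minimize f(x) = ||x||^2, whose gradient is 2x.
+ grad = lambda x: 2.0 * x
+ print(fractional_gradient_descent(grad, x0=[3.0, -2.0], c=np.zeros(2)))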
+
+ comment: 19 pages, 4 figures +
+
+
+
+
+ + ☆ RainAI -- Precipitation Nowcasting from Satellite Data + + +
+ This paper presents a solution to the Weather4Cast 2023 competition, where
+the goal is to forecast high-resolution precipitation with an 8-hour lead time
+using lower-resolution satellite radiance images. We propose a simple, yet
+effective method for spatiotemporal feature learning using a 2D U-Net model,
+which outperforms the official 3D U-Net baseline in both performance and
+efficiency. We place emphasis on refining the dataset through importance
+sampling and dataset preparation, and show that such techniques have a
+significant impact on performance. We further study an alternative
+cross-entropy loss function that improves performance over the standard mean
+squared error loss, while also enabling models to produce probabilistic
+outputs. Additional techniques are explored regarding the generation of
+predictions at different lead times, specifically through Conditioning Lead
+Time. Lastly, to generate high-resolution forecasts, we evaluate standard and
+learned upsampling methods. The code and trained parameters are available at
+https://github.com/rafapablos/w4c23-rainai.
+
+
+
+
+ + ☆ Data-efficient Deep Reinforcement Learning for Vehicle Trajectory + Control + + +
+ Advanced vehicle control is a fundamental building block in the development
+of autonomous driving systems. Reinforcement learning (RL) promises to achieve
+control performance superior to classical approaches while keeping
+computational demands low during deployment. However, standard RL approaches
+like soft actor-critic (SAC) require extensive amounts of training data to be
+collected and are thus impractical for real-world application. To address this
+issue, we apply recently developed data-efficient deep RL methods to vehicle
+trajectory control. Our investigation focuses on three methods, so far
+unexplored for vehicle control: randomized ensemble double Q-learning (REDQ),
+probabilistic ensembles with trajectory sampling and model predictive path
+integral optimizer (PETS-MPPI), and model-based policy optimization (MBPO). We
+find that in the case of trajectory control, the standard model-based RL
+formulation used in approaches like PETS-MPPI and MBPO is not suitable. We,
+therefore, propose a new formulation that splits dynamics prediction and
+vehicle localization. Our benchmark study on the CARLA simulator reveals that
+the three identified data-efficient deep RL approaches learn control strategies
+on a par with or better than SAC, yet reduce the required number of environment
+interactions by more than one order of magnitude.
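+ To make the first of these methods concrete, the snippet below sketches the
+core of REDQ (Chen et al., 2021): the critic target takes a minimum over a
+small random subset of a Q-ensemble, combined with an SAC-style entropy term.
+Names and signatures are illustrative, not taken from the paper's code.
+
+ import random
+ import torch
+
+ def redq_target(q_targets, s_next, a_next, logp_next, r, done,
+                 gamma=0.99, alpha=0.2, num_sampled=2):
+     # q_targets: list of target critics; the minimum over a random subset
+     # controls overestimation while allowing a high update-to-data ratio.
+     subset = random.sample(q_targets, num_sampled)
+     q_vals = torch.stack([q(s_next, a_next) for q in subset], dim=0)
+     q_min = q_vals.min(dim=0).values
+     return r + gamma * (1.0 - done) * (q_min - alpha * logp_next)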
+
+
+
+
+ + ☆ On Exact Inversion of DPM-Solvers + + +
+ Diffusion probabilistic models (DPMs) are a key component in modern
+generative models. DPM-solvers have significantly reduced latency and enhanced
+quality, but make it challenging to find the exact inverse
+(i.e., finding the initial noise from the given image). Here we investigate
+exact inversion for DPM-solvers and propose algorithms to perform it when
+samples are generated by first-order as well as higher-order DPM-solvers.
+For each explicit denoising step in DPM-solvers, we formulate the inversion
+using implicit methods such as gradient descent or the forward step method,
+ensuring robustness to large classifier-free guidance, unlike the prior
+approach based on fixed-point iteration. Experimental results demonstrate that
+our proposed exact inversion methods significantly reduce the error of both
+image and noise reconstructions, greatly enhance the ability to distinguish
+invisible watermarks, and consistently prevent unintended background changes
+during image editing. Project page:
+\url{https://smhongok.github.io/inv-dpm.html}.
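+ The gradient-descent flavour of this inversion idea can be sketched as
+follows: given one (differentiable) denoising step of a DPM-solver, the
+preceding latent is recovered by minimising the reconstruction error instead of
+running a fixed-point iteration. This is a hedged illustration; step_fn and the
+optimiser settings are placeholders, not the authors' implementation.
+
+ import torch
+
+ def invert_step_by_gradient_descent(step_fn, x_next, x_init, iters=100, lr=0.1):
+     # Find x such that step_fn(x) is approximately x_next.
+     x = x_init.clone().requires_grad_(True)
+     opt = torch.optim.SGD([x], lr=lr)
+     for _ in range(iters):
+         opt.zero_grad()
+         loss = torch.sum((step_fn(x) - x_next) ** 2)
+         loss.backward()
+         opt.step()
+     return x.detach()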
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Transfer Learning across Different Chemical Domains: Virtual Screening + of Organic Materials with Deep Learning Models Pretrained on Small Molecule + and Chemical Reaction Data + + +
+ Machine learning prediction of organic materials properties is an efficient +virtual screening method ahead of more expensive screening methods. However, +this approach has suffered from insufficient labeled data on organic materials +to train state-of-the-art machine learning models. In this study, we +demonstrate that drug-like small molecule and chemical reaction databases can +be used to pretrain the BERT model for the virtual screening of organic +materials. Among the BERT models fine-tuned by five virtual screening tasks on +organic materials, the USPTO-SMILES pretrained BERT model had R2 > 0.90 for two +tasks and R2 > 0.82 for one, which was generally superior to the same models +pretrained by the small molecule or organic materials databases, as well as to +the other three traditional machine learning models trained directly on the +virtual screening task data. The superior performance of the USPTO-SMILES +pretrained BERT model is due to the greater variety of organic building blocks +in the USPTO database and the broader coverage of the chemical space. The even +better performance of the BERT model pretrained externally from a chemical +reaction database with additional sources of chemical reactions strengthens our +proof of concept that transfer learning across different chemical domains is +practical for the virtual screening of organic materials. + +
+
+
+
+
+ + ☆ Age Effects on Decision-Making, Drift Diffusion Model + + +
+ Training can improve human decision-making performance. After several
+training sessions, a person can quickly and accurately complete a task.
+However, decision-making is always a trade-off between accuracy and response
+time. Factors such as age and drug abuse can affect the decision-making
+process. This study examines how training can improve the performance of
+different age groups in completing a random dot motion (RDM) task. The
+participants are divided into two groups: old and young. They undergo a
+three-phase training and then repeat the same RDM task. A hierarchical
+drift-diffusion model is used to analyze the subjects' responses and to
+determine how the model's parameters change after training for both age
+groups. The results show that after training, the participants were able to
+accumulate sensory information faster, and the model drift rate increased.
+However, their decision boundary decreased as they became more confident and
+had a lower decision-making threshold. Additionally, the old group had a
+higher boundary and lower drift rate both pre- and post-training, and there
+was less difference between the two groups' parameters after training.
+
+
+
+
+ + ☆ Hubness Reduction Improves Sentence-BERT Semantic Spaces + + +
+ Semantic representations of text, i.e. representations of natural language
+which capture meaning by geometry, are essential for areas such as information
+retrieval and document grouping. High-dimensional trained dense vectors have
+received much attention in recent years as such representations. We investigate
+the structure of semantic spaces that arise from embeddings made with
+Sentence-BERT and find that the representations suffer from a well-known
+problem in high dimensions called hubness. Hubness results in asymmetric
+neighborhood relations, such that some texts (the hubs) are neighbours of many
+other texts while most texts (so-called anti-hubs) are neighbours of few or no
+other texts. We quantify the semantic quality of the embeddings using hubness
+scores and the error rate of a neighbourhood-based classifier. We find that when
+hubness is high, we can reduce the error rate and hubness using hubness reduction
+methods. We identify a combination of two methods as resulting in the best
+reduction. For example, on one of the tested pretrained models, this combined
+method can reduce hubness by about 75% and error rate by about 9%. Thus, we
+argue that mitigating hubness in the embedding space provides better semantic
+representations of text.
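+ Hubness is usually quantified as the skewness of the k-occurrence
+distribution, i.e. how often each point appears among the k nearest neighbours
+of other points. A minimal sketch of such a score (the paper's exact
+measurement pipeline may differ) is:
+
+ import numpy as np
+ from scipy.stats import skew
+ from sklearn.neighbors import NearestNeighbors
+
+ def hubness_score(embeddings, k=10):
+     # k-occurrence: number of times each point is among the k nearest
+     # neighbours of other points; its skewness is a standard hubness score.
+     nn = NearestNeighbors(n_neighbors=k + 1).fit(embeddings)
+     _, idx = nn.kneighbors(embeddings)
+     idx = idx[:, 1:]  # drop the self-neighbour
+     k_occurrence = np.bincount(idx.ravel(), minlength=len(embeddings))
+     return skew(k_occurrence)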
+
+ comment: Accepted at NLDL 2024 +
+
+
+
+
+ + ☆ Towards Comparable Active Learning + + +
+ Active Learning has received significant attention in the field of machine +learning for its potential in selecting the most informative samples for +labeling, thereby reducing data annotation costs. However, we show that the +reported lifts in recent literature generalize poorly to other domains leading +to an inconclusive landscape in Active Learning research. Furthermore, we +highlight overlooked problems for reproducing AL experiments that can lead to +unfair comparisons and increased variance in the results. This paper addresses +these issues by providing an Active Learning framework for a fair comparison of +algorithms across different tasks and domains, as well as a fast and performant +oracle algorithm for evaluation. To the best of our knowledge, we propose the +first AL benchmark that tests algorithms in 3 major domains: Tabular, Image, +and Text. We report empirical results for 6 widely used algorithms on 7 +real-world and 2 synthetic datasets and aggregate them into a domain-specific +ranking of AL algorithms. + +
+
+
+
+
+ + ☆ Reconstructing Historical Climate Fields With Deep Learning + + +
+ Historical records of climate fields are often sparse due to missing +measurements, especially before the introduction of large-scale satellite +missions. Several statistical and model-based methods have been introduced to +fill gaps and reconstruct historical records. Here, we employ a recently +introduced deep-learning approach based on Fourier convolutions, trained on +numerical climate model output, to reconstruct historical climate fields. Using +this approach we are able to realistically reconstruct large and irregular +areas of missing data, as well as reconstruct known historical events such as +strong El Ni\~no and La Ni\~na with very little given information. Our method +outperforms the widely used statistical kriging method as well as other recent +machine learning approaches. The model generalizes to higher resolutions than +the ones it was trained on and can be used on a variety of climate fields. +Moreover, it allows inpainting of masks never seen before during the model +training. + +
+
+
+
+
+ + ☆ Learning Robust Precipitation Forecaster by Temporal Frame Interpolation + + +
+ Recent advancements in deep learning have propelled the field of weather
+prediction models to new heights. Despite their progress, these models often
+struggle with real-world application due to their sensitivity to
+spatial-temporal shifts, a vulnerability particularly pronounced in weather
+prediction tasks where overfitting to local and temporal variations is common.
+This paper presents an investigation into the development of a robust
+precipitation forecasting model that stands resilient to such shifts. We
+introduce Temporal Frame Interpolation (TFI), an innovative technique designed
+to fortify forecasting models against spatial-temporal discrepancies. TFI
+operates by generating synthetic samples through the interpolation of adjacent
+frames from satellite imagery and ground radar data, thereby enriching the
+training dataset and bolstering the model's defense against noise on frames.
+Additionally, we integrate a novel multi-level dice loss, which exploits the
+ordinal nature of rainfall intensities to further refine model performance.
+These methodologies have collectively advanced our model's forecasting
+precision, achieving \textit{1st place} on the transfer learning leaderboard in
+the \textit{Weather4Cast'23 competition}. This result not only demonstrates the
+efficacy of our approaches but also sets a new benchmark for deep learning
+applications in meteorological forecasting. Our code and weights are publicly
+available at \url{https://github.com/Secilia-Cxy/UNetTFI}.
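+ The frame-interpolation idea itself is simple to sketch: synthetic samples are
+formed by linearly blending adjacent time frames (and their targets). The
+snippet below is only an illustration of that mechanism, not the authors' exact
+TFI implementation.
+
+ import numpy as np
+
+ def temporal_frame_interpolation(frames, targets, lam=None, rng=None):
+     # frames, targets: arrays with time as the leading axis.
+     rng = np.random.default_rng() if rng is None else rng
+     lam = rng.uniform(0.0, 1.0) if lam is None else lam
+     new_frames = lam * frames[:-1] + (1.0 - lam) * frames[1:]
+     new_targets = lam * targets[:-1] + (1.0 - lam) * targets[1:]
+     return new_frames, new_targets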
+
+ comment: arXiv admin note: text overlap with arXiv:2212.02968 by other authors +
+
+
+
+
+ + ☆ Learning for Semantic Knowledge Base-Guided Online Feature Transmission + in Dynamic Channels + + +
+ With the proliferation of edge computing, efficient AI inference on edge +devices has become essential for intelligent applications such as autonomous +vehicles and VR/AR. In this context, we address the problem of efficient remote +object recognition by optimizing feature transmission between mobile devices +and edge servers. We propose an online optimization framework to address the +challenge of dynamic channel conditions and device mobility in an end-to-end +communication system. Our approach builds upon existing methods by leveraging a +semantic knowledge base to drive multi-level feature transmission, accounting +for temporal factors and dynamic elements throughout the transmission process. +To solve the online optimization problem, we design a novel soft +actor-critic-based deep reinforcement learning system with a carefully designed +reward function for real-time decision-making, overcoming the optimization +difficulty of the NP-hard problem and achieving the minimization of semantic +loss while respecting latency constraints. Numerical results showcase the +superiority of our approach compared to traditional greedy methods under +various system setups. + +
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ Automatic Implementation of Neural Networks through Reaction Networks -- + Part I: Circuit Design and Convergence Analysis + + +
+ Information processing relying on biochemical interactions in the cellular +environment is essential for biological organisms. The implementation of +molecular computational systems holds significant interest and potential in the +fields of synthetic biology and molecular computation. This two-part article +aims to introduce a programmable biochemical reaction network (BCRN) system +endowed with mass action kinetics that realizes the fully connected neural +network (FCNN) and has the potential to act automatically in vivo. In part I, +the feedforward propagation computation, the backpropagation component, and all +bridging processes of FCNN are ingeniously designed as specific BCRN modules +based on their dynamics. This approach addresses a design gap in the +biochemical assignment module and judgment termination module and provides a +novel precise and robust realization of bi-molecular reactions for the learning +process. Through equilibrium approaching, we demonstrate that the designed BCRN +system achieves FCNN functionality with exponential convergence to target +computational results, thereby enhancing the theoretical support for such work. +Finally, the performance of this construction is further evaluated on two +typical logic classification problems. + +
+
+
+
+
+ + ☆ Categorical Traffic Transformer: Interpretable and Diverse Behavior + Prediction with Tokenized Latent + + +
+ Adept traffic models are critical to both planning and closed-loop simulation +for autonomous vehicles (AV), and key design objectives include accuracy, +diverse multimodal behaviors, interpretability, and downstream compatibility. +Recently, with the advent of large language models (LLMs), an additional +desirable feature for traffic models is LLM compatibility. We present +Categorical Traffic Transformer (CTT), a traffic model that outputs both +continuous trajectory predictions and tokenized categorical predictions (lane +modes, homotopies, etc.). The most outstanding feature of CTT is its fully +interpretable latent space, which enables direct supervision of the latent +variable from the ground truth during training and avoids mode collapse +completely. As a result, CTT can generate diverse behaviors conditioned on +different latent modes with semantic meanings while beating SOTA on prediction +accuracy. In addition, CTT's ability to input and output tokens enables +integration with LLMs for common-sense reasoning and zero-shot generalization. + +
+
+
+
+
+ + ☆ PAUNet: Precipitation Attention-based U-Net for rain prediction from + satellite radiance data + + +
+ This paper introduces Precipitation Attention-based U-Net (PAUNet), a deep
+learning architecture for predicting precipitation from satellite radiance
+data, addressing the challenges of the Weather4cast 2023 competition. PAUNet is
+a variant of U-Net and Res-Net, designed to effectively capture the large-scale
+contextual information of multi-band satellite images in visible, water vapor,
+and infrared bands through encoder convolutional layers with center cropping
+and attention mechanisms. We build upon the Focal Precipitation Loss by
+including an exponential component (e-FPL), which further emphasizes the
+different precipitation categories, particularly medium and heavy rain. Trained
+on a substantial dataset from various European regions, PAUNet demonstrates
+notable accuracy with a higher Critical Success Index (CSI) score than the
+baseline model in predicting rainfall over multiple time slots. PAUNet's
+architecture and training methodology showcase improvements in precipitation
+forecasting, crucial for sectors like emergency services and retail and supply
+chain management.
+
+
+
+
+ + ☆ Semiparametric Efficient Inference in Adaptive Experiments + + +
+ We consider the problem of efficient inference of the Average Treatment +Effect in a sequential experiment where the policy governing the assignment of +subjects to treatment or control can change over time. We first provide a +central limit theorem for the Adaptive Augmented Inverse-Probability Weighted +estimator, which is semiparametric efficient, under weaker assumptions than +those previously made in the literature. This central limit theorem enables +efficient inference at fixed sample sizes. We then consider a sequential +inference setting, deriving both asymptotic and nonasymptotic confidence +sequences that are considerably tighter than previous methods. These +anytime-valid methods enable inference under data-dependent stopping times +(sample sizes). Additionally, we use propensity score truncation techniques +from the recent off-policy estimation literature to reduce the finite sample +variance of our estimator without affecting the asymptotic variance. Empirical +results demonstrate that our methods yield narrower confidence sequences than +those previously developed in the literature while maintaining time-uniform +error control. + +
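+ For reference, the Adaptive AIPW estimator referred to above typically takes
+the form (standard notation; the paper's exact construction and variance
+estimates may differ)
+ $$\hat{\tau}_T=\frac{1}{T}\sum_{t=1}^{T}\left[\hat{\mu}_t(1,X_t)-\hat{\mu}_t(0,X_t)+\frac{A_t\,\big(Y_t-\hat{\mu}_t(1,X_t)\big)}{e_t(X_t)}-\frac{(1-A_t)\,\big(Y_t-\hat{\mu}_t(0,X_t)\big)}{1-e_t(X_t)}\right],$$
+where $e_t$ is the (time-varying) treatment-assignment probability and the
+outcome regressions $\hat{\mu}_t$ are fitted only on data observed before round
+$t$, which is what keeps the estimator valid under adaptive assignment.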
+
+ comment: 23 pages, 6 figures +
+
+
+
+
+ + ☆ Learning Exactly Linearizable Deep Dynamics Models + + +
+ Research on control using models based on machine-learning methods has now
+shifted to the practical engineering stage. Achieving high performance and
+theoretically guaranteeing the safety of the system is critical for such
+applications. In this paper, we propose a learning method for exactly
+linearizable dynamical models to which various control theories can easily be
+applied to ensure stability, reliability, etc., while providing a high degree
+of expressive freedom. As an example, we present a design that combines simple
+linear control and control barrier functions. The proposed model is employed
+for the real-time control of an automotive engine, and the results demonstrate
+good predictive performance and stable control under constraints.
+
+
+
+
+ + ☆ Consensus, dissensus and synergy between clinicians and specialist + foundation models in radiology report generation + + +
+ Radiology reports are an instrumental part of modern medicine, informing key
+clinical decisions such as diagnosis and treatment. The worldwide shortage of
+radiologists, however, restricts access to expert care and imposes heavy
+workloads, contributing to avoidable errors and delays in report delivery.
+While recent progress in automated report generation with vision-language
+models offers clear potential in ameliorating the situation, the path to
+real-world adoption has been stymied by the challenge of evaluating the
+clinical quality of AI-generated reports. In this study, we build a
+state-of-the-art report generation system for chest radiographs, Flamingo-CXR,
+by fine-tuning a well-known vision-language foundation model on radiology data.
+To evaluate the quality of the AI-generated reports, a group of 16 certified
+radiologists provide detailed evaluations of AI-generated and human-written
+reports for chest X-rays from an intensive care setting in the United States
+and an inpatient setting in India. At least one radiologist (out of two per
+case) preferred the AI report to the ground truth report in over 60$\%$ of
+cases for both datasets. Amongst the subset of AI-generated reports that
+contain errors, the most frequently cited reasons were related to the location
+and finding, whereas for human-written reports, most mistakes were related to
+severity and finding. This disparity suggested potential complementarity
+between our AI system and human experts, prompting us to develop an assistive
+scenario in which Flamingo-CXR generates a first-draft report, which is
+subsequently revised by a clinician. This is the first demonstration of
+clinician-AI collaboration for report writing, and the resultant reports are
+assessed to be equivalent or preferred by at least one radiologist to reports
+written by experts alone in 80$\%$ of in-patient cases and 66$\%$ of intensive
+care cases.
+
+
+
+
+ + ☆ Diffusion Models Without Attention + + +
+ In recent advancements in high-fidelity image generation, Denoising Diffusion
+Probabilistic Models (DDPMs) have emerged as a key player. However, their
+application at high resolutions presents significant computational challenges.
+Current methods, such as patchifying, expedite processes in UNet and
+Transformer architectures but at the expense of representational capacity.
+Addressing this, we introduce the Diffusion State Space Model (DiffuSSM), an
+architecture that supplants attention mechanisms with a more scalable state
+space model backbone. This approach effectively handles higher resolutions
+without resorting to global compression, thus preserving detailed image
+representation throughout the diffusion process. Our focus on FLOP-efficient
+architectures in diffusion training marks a significant step forward.
+Comprehensive evaluations on both ImageNet and LSUN datasets at two resolutions
+demonstrate that DiffuSSMs are on par with or even outperform existing diffusion
+models with attention modules in FID and Inception Score metrics while
+significantly reducing total FLOP usage.
+
+
+
+
+ + ☆ Navigating Privacy and Copyright Challenges Across the Data Lifecycle of + Generative AI + + +
+ The advent of Generative AI has marked a significant milestone in artificial
+intelligence, demonstrating remarkable capabilities in generating realistic
+images, texts, and data patterns. However, these advancements come with
+heightened concerns over data privacy and copyright infringement, primarily due
+to the reliance on vast datasets for model training. Traditional approaches
+like differential privacy, machine unlearning, and data poisoning only offer
+fragmented solutions to these complex issues. Our paper delves into the
+multifaceted challenges of privacy and copyright protection within the data
+lifecycle. We advocate for integrated approaches that combine technical
+innovation with ethical foresight, holistically addressing these concerns by
+investigating and devising solutions that are informed by the lifecycle
+perspective. This work aims to catalyze a broader discussion and inspire
+concerted efforts towards data privacy and copyright integrity in Generative
+AI.
+
+
+
+
+ + ☆ Combined Scheduling, Memory Allocation and Tensor Replacement for + Minimizing Off-Chip Data Accesses of DNN Accelerators + + +
+ Specialized hardware accelerators have been extensively used for Deep Neural
+Networks (DNNs) to provide power/performance benefits. These accelerators
+contain specialized hardware that supports DNN operators, and scratchpad memory
+for storing the tensor operands. Often, the size of the scratchpad is
+insufficient to store all the tensors needed for the computation, and
+additional data accesses are needed to move tensors back and forth from host
+memory during the computation with significant power/performance overhead. The
+volume of these additional data accesses depends on the operator schedule, and
+memory allocation (specific locations selected for the tensors in the
+scratchpad). We propose an optimization framework, named COSMA, for mapping
+DNNs to an accelerator that finds the optimal operator schedule, memory
+allocation and tensor replacement that minimizes the additional data accesses.
+COSMA provides an Integer Linear Programming (ILP) formulation to generate the
+optimal solution for mapping a DNN to the accelerator for a given scratchpad
+size. We demonstrate that, using an off-the-shelf ILP solver, COSMA obtains the
+optimal solution in seconds for a wide range of state-of-the-art DNNs for
+different applications. Further, it outperforms existing methods by reducing
+the non-compulsory data accesses by 84% on average. We further propose a
+divide-and-conquer heuristic to scale up to certain complex DNNs generated by
+Neural Architecture Search, and this heuristic solution reduces data accesses
+by 85% on average compared with other works.
+
+
+
+
+ + ☆ Poisoning Attacks Against Contrastive Recommender Systems + + +
+ Contrastive learning (CL) has recently gained significant popularity in the +field of recommendation. Its ability to learn without heavy reliance on labeled +data is a natural antidote to the data sparsity issue. Previous research has +found that CL can not only enhance recommendation accuracy but also +inadvertently exhibit remarkable robustness against noise. However, this paper +identifies a vulnerability of CL-based recommender systems: Compared with their +non-CL counterparts, they are even more susceptible to poisoning attacks that +aim to promote target items. Our analysis points to the uniform dispersion of +representations led by the CL loss as the very factor that accounts for this +vulnerability. We further theoretically and empirically demonstrate that the +optimization of CL loss can lead to smooth spectral values of representations. +Based on these insights, we attempt to reveal the potential poisoning attacks +against CL-based recommender systems. The proposed attack encompasses a +dual-objective framework: One that induces a smoother spectral value +distribution to amplify the CL loss's inherent dispersion effect, named +dispersion promotion; and the other that directly elevates the visibility of +target items, named rank promotion. We validate the destructiveness of our +attack model through extensive experimentation on four datasets. By shedding +light on these vulnerabilities, we aim to facilitate the development of more +robust CL-based recommender systems. + +
+
+ comment: 14 pages, 6 figures, 5 tables
+
+
+
+
+
+ + ☆ DKiS: Decay weight invertible image steganography with private key + + +
+ Image steganography, the practice of concealing information within another +image, traditionally faces security challenges when its methods become publicly +known. To counteract this, we introduce a novel private key-based image +steganography technique. This approach ensures the security of hidden +information, requiring a corresponding private key for access, irrespective of +the public knowledge of the steganography method. We present experimental +evidence demonstrating our method's effectiveness, showcasing its real-world +applicability. Additionally, we identified a critical challenge in the +invertible image steganography process: the transfer of non-essential, or +`garbage', information from the secret to the host pipeline. To address this, +we introduced the decay weight to control the information transfer, filtering +out irrelevant data and enhancing the performance of image steganography. Our +code is publicly accessible at https://github.com/yanghangAI/DKiS, and a +practical demonstration is available at http://yanghang.site/hidekey. + +
+
+
+
+
+ + ☆ Label-efficient Training of Small Task-specific Models by Leveraging + Vision Foundation Models + + +
+ Large Vision Foundation Models (VFMs) pretrained on massive datasets exhibit +impressive performance on various downstream tasks, especially with limited +labeled target data. However, due to their high memory and compute +requirements, these models cannot be deployed in resource constrained settings. +This raises an important question: How can we utilize the knowledge from a +large VFM to train a small task-specific model for a new target task with +limited labeled training data? In this work, we answer this question by +proposing a simple and highly effective task-oriented knowledge transfer +approach to leverage pretrained VFMs for effective training of small +task-specific models. Our experimental results on four target tasks under +limited labeled data settings show that the proposed knowledge transfer +approach outperforms task-agnostic VFM distillation, web-scale CLIP pretraining +and supervised ImageNet pretraining by 1-10.5%, 2-22% and 2-14%, respectively. +We also show that the dataset used for transferring knowledge has a significant +effect on the final target task performance, and propose an image +retrieval-based approach for curating effective transfer sets. + +
+
+
+
+
+ + ☆ LMRL Gym: Benchmarks for Multi-Turn Reinforcement Learning with Language + Models + + +
+ Large language models (LLMs) provide excellent text-generation capabilities, +but standard prompting and generation methods generally do not lead to +intentional or goal-directed agents and might necessitate considerable prompt +tuning. This becomes particularly apparent in multi-turn conversations: even +the best current LLMs rarely ask clarifying questions, engage in explicit +information gathering, or take actions now that lead to better decisions after +multiple turns. Reinforcement learning has the potential to leverage the +powerful modeling capabilities of LLMs, as well as their internal +representation of textual interactions, to create capable goal-directed +language agents. This can enable intentional and temporally extended +interactions, such as with humans, through coordinated persuasion and carefully +crafted questions, or in goal-directed play through text games to bring about +desired final outcomes. However, enabling this requires the community to +develop stable and reliable reinforcement learning algorithms that can +effectively train LLMs. Developing such algorithms requires tasks that can +gauge progress on algorithm design, provide accessible and reproducible +evaluations for multi-turn interactions, and cover a range of task properties +and challenges in improving reinforcement learning algorithms. Our paper +introduces the LMRL-Gym benchmark for evaluating multi-turn RL for LLMs, +together with an open-source research framework containing a basic toolkit for +getting started on multi-turn RL with offline value-based and policy-based RL +methods. Our benchmark consists of 8 different language tasks, which require +multiple rounds of language interaction and cover a range of tasks in +open-ended dialogue and text games. + +
+
+
+
+
+ + ☆ Reasoning with the Theory of Mind for Pragmatic Semantic Communication + + +
+ In this paper, a pragmatic semantic communication framework that enables
+effective goal-oriented information sharing between two intelligent agents is
+proposed. In particular, semantics is defined as the causal state that
+encapsulates the fundamental causal relationships and dependencies among
+different features extracted from data. The proposed framework leverages the
+emerging concept in machine learning (ML) called theory of mind (ToM). It
+employs a dynamic two-level (wireless and semantic) feedback mechanism to
+continuously fine-tune neural network components at the transmitter. Thanks to
+the ToM, the transmitter mimics the actual mental state of the receiver's
+reasoning neural network, which performs semantic interpretation. The estimated
+mental state of the receiver is then dynamically updated via the proposed
+dynamic two-level feedback mechanism. At the lower level, conventional channel
+quality metrics are used to optimize the channel encoding process based on the
+wireless communication channel's quality, ensuring an efficient mapping of
+semantic representations to a finite constellation. Additionally, a semantic
+feedback level is introduced, providing information on the receiver's perceived
+semantic effectiveness with minimal overhead. Numerical evaluations demonstrate
+the framework's ability to achieve efficient communication with a reduced
+amount of bits while maintaining the same semantics, outperforming conventional
+systems that do not exploit the ToM-based reasoning.
+
+
+
+
+ + ☆ SMaRt: Improving GANs with Score Matching Regularity + + +
+ Generative adversarial networks (GANs) usually struggle in learning from
+highly diverse data, whose underlying manifold is complex. In this work, we
+revisit the mathematical foundations of GANs, and theoretically reveal that the
+native adversarial loss for GAN training is insufficient to fix the problem of
+subsets with positive Lebesgue measure of the generated data manifold lying
+outside the real data manifold. Instead, we find that score matching serves as a
+valid solution to this issue thanks to its capability of persistently pushing
+the generated data points towards the real data manifold. We thereby propose to
+improve the optimization of GANs with score matching regularity (SMaRt).
+Regarding the empirical evidence, we first design a toy example to show that
+training GANs with the aid of a ground-truth score function can help reproduce
+the real data distribution more accurately, and then confirm that our approach
+can consistently boost the synthesis performance of various state-of-the-art
+GANs on real-world datasets with pre-trained diffusion models acting as the
+approximate score function. For instance, when training Aurora on the ImageNet
+64x64 dataset, we manage to improve FID from 8.87 to 7.11, on par with the
+performance of the one-step consistency model. The source code will be made
+public.
+
+
+
+
+ + ☆ Towards Assessing and Benchmarking Risk-Return Tradeoff of Off-Policy + Evaluation + + +
+ Off-Policy Evaluation (OPE) aims to assess the effectiveness of +counterfactual policies using only offline logged data and is often used to +identify the top-k promising policies for deployment in online A/B tests. +Existing evaluation metrics for OPE estimators primarily focus on the +"accuracy" of OPE or that of downstream policy selection, neglecting +risk-return tradeoff in the subsequent online policy deployment. To address +this issue, we draw inspiration from portfolio evaluation in finance and +develop a new metric, called SharpeRatio@k, which measures the risk-return +tradeoff of policy portfolios formed by an OPE estimator under varying online +evaluation budgets (k). We validate our metric in two example scenarios, +demonstrating its ability to effectively distinguish between low-risk and +high-risk estimators and to accurately identify the most efficient estimator. +This efficient estimator is characterized by its capability to form the most +advantageous policy portfolios, maximizing returns while minimizing risks +during online deployment, a nuance that existing metrics typically overlook. To +facilitate a quick, accurate, and consistent evaluation of OPE via +SharpeRatio@k, we have also integrated this metric into an open-source +software, SCOPE-RL. Employing SharpeRatio@k and SCOPE-RL, we conduct +comprehensive benchmarking experiments on various estimators and RL tasks, +focusing on their risk-return tradeoff. These experiments offer several +interesting directions and suggestions for future OPE research. + +
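+ By analogy with the financial Sharpe ratio, one plausible form of the metric is
+(an illustrative sketch; the precise definition is given in the paper and in
+SCOPE-RL)
+ $$\mathrm{SharpeRatio@}k \;=\; \frac{\operatorname{mean}_{\pi \in \Pi_k} J(\pi) \;-\; J(\pi_0)}{\operatorname{std}_{\pi \in \Pi_k} J(\pi)},$$
+where $\Pi_k$ is the portfolio of top-$k$ policies selected by the OPE
+estimator, $J(\pi)$ is the online value of policy $\pi$, and $J(\pi_0)$ plays
+the role of the risk-free baseline (e.g. the behavior policy's value).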
+
+ comment: preprint, under review +
+
+
+
+
+ + ☆ SCOPE-RL: A Python Library for Offline Reinforcement Learning and + Off-Policy Evaluation + + +
+ This paper introduces SCOPE-RL, comprehensive open-source Python software
+designed for offline reinforcement learning (offline RL), off-policy evaluation
+(OPE), and selection (OPS). Unlike most existing libraries that focus solely on
+either policy learning or evaluation, SCOPE-RL seamlessly integrates these two
+key aspects, facilitating flexible and complete implementations of both offline
+RL and OPE processes. SCOPE-RL puts particular emphasis on its OPE modules,
+offering a range of OPE estimators and robust evaluation-of-OPE protocols. This
+approach enables more in-depth and reliable OPE compared to other packages. For
+instance, SCOPE-RL enhances OPE by estimating the entire reward distribution
+under a policy rather than its mere point-wise expected value. Additionally,
+SCOPE-RL provides a more thorough evaluation-of-OPE by presenting the
+risk-return tradeoff in OPE results, extending beyond mere accuracy evaluations
+in the existing OPE literature. SCOPE-RL is designed with user accessibility in
+mind. Its user-friendly APIs, comprehensive documentation, and a variety of
+easy-to-follow examples assist researchers and practitioners in efficiently
+implementing and experimenting with various offline RL methods and OPE
+estimators, tailored to their specific problem contexts. The documentation of
+SCOPE-RL is available at https://scope-rl.readthedocs.io/en/latest/.
+
+ comment: preprint, open-source software: + https://github.com/hakuhodo-technologies/scope-rl +
+
+
+
+
+ + ☆ Positional Information Matters for Invariant In-Context Learning: A Case + Study of Simple Function Classes + + +
+ In-context learning (ICL) refers to the ability of a model to condition on a
+few in-context demonstrations (input-output examples of the underlying task) to
+generate the answer for a new query input, without updating parameters. Despite
+the impressive ICL ability of LLMs, it has also been found that ICL in LLMs is
+sensitive to input demonstrations and limited to short context lengths. To
+understand the limitations and principles for successful ICL, we conduct an
+investigation of ICL linear regression with transformers. We characterize
+several Out-of-Distribution (OOD) cases for ICL inspired by realistic LLM ICL
+failures and compare transformers with DeepSet, a simple yet powerful
+architecture for ICL. Surprisingly, DeepSet outperforms transformers across a
+variety of distribution shifts, implying that preserving permutation-invariance
+symmetry with respect to input demonstrations is crucial for OOD ICL. This
+phenomenon points to a fundamental requirement of ICL, which we term ICL
+invariance. Nevertheless, the positional encodings in LLMs will break ICL
+invariance. To this end, we further evaluate transformers with identical
+positional encodings and find that preserving ICL invariance in transformers
+achieves state-of-the-art performance across various ICL distribution shifts.
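+ A DeepSet-style in-context regressor of the kind compared against can be
+sketched in a few lines: each demonstration is encoded independently and the
+encodings are mean-pooled, so the prediction is invariant to the order of the
+demonstrations. This is a generic sketch, not the authors' exact architecture.
+
+ import torch
+ import torch.nn as nn
+
+ class DeepSetICL(nn.Module):
+     def __init__(self, x_dim, hidden=128):
+         super().__init__()
+         self.enc = nn.Sequential(nn.Linear(x_dim + 1, hidden), nn.ReLU(),
+                                  nn.Linear(hidden, hidden))
+         self.dec = nn.Sequential(nn.Linear(hidden + x_dim, hidden), nn.ReLU(),
+                                  nn.Linear(hidden, 1))
+
+     def forward(self, xs, ys, x_query):
+         # xs: (batch, n_demos, x_dim), ys: (batch, n_demos, 1), x_query: (batch, x_dim)
+         pooled = self.enc(torch.cat([xs, ys], dim=-1)).mean(dim=1)
+         return self.dec(torch.cat([pooled, x_query], dim=-1))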
+
+ comment: Ongoing work; preliminary version +
+
+
+
+
+ + ☆ Toward the Tradeoffs between Privacy, Fairness and Utility in Federated + Learning + + +
+ Federated Learning (FL) is a novel privacy-protection distributed machine
+learning paradigm that guarantees user privacy and prevents the risk of data
+leakage by taking advantage of the clients' local training. Researchers have
+made efforts to design fair FL systems that ensure fairness of results. However,
+the interplay between fairness and privacy has been less studied. Increasing
+the fairness of FL systems can have an impact on user privacy, while an
+increase in user privacy can affect fairness. In this work, on the client side,
+we use fairness metrics, such as Demographic Parity (DemP), Equalized Odds
+(EOs), and Disparate Impact (DI), to construct the local fair model. To protect
+the privacy of the client model, we propose a privacy-protection fairness FL
+method. The results show that the accuracy of the fair model with privacy
+increases because privacy breaks the constraints of the fairness metrics. In
+our experiments, we characterize the relationship between privacy, fairness,
+and utility, and find that there is a tradeoff between them.
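+ For reference, the fairness notions named above are the standard ones: with
+sensitive attribute $A$, prediction $\hat{Y}$ and label $Y$, Demographic Parity
+requires $\Pr(\hat{Y}=1 \mid A=0)=\Pr(\hat{Y}=1 \mid A=1)$, Equalized Odds
+requires $\Pr(\hat{Y}=1 \mid A=a, Y=y)$ to be equal across groups $a$ for each
+$y$, and Disparate Impact is usually reported as the ratio
+$\Pr(\hat{Y}=1 \mid A=0)\,/\,\Pr(\hat{Y}=1 \mid A=1)$. (Standard definitions;
+the paper presumably imposes relaxed or empirical versions of them as training
+constraints.)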
+
+ comment: 17 pages, 3 figures, conference +
+
+
+
+
+ + ☆ Leveraging cache to enable SLU on tiny devices + + +
+ This paper addresses spoken language understanding (SLU) on +microcontroller-like embedded devices, integrating on-device execution with +cloud offloading in a novel fashion. We exploit temporal locality in a device's +speech inputs and accordingly reuse recent SLU inferences. Our idea is simple: +let the device match new inputs against cached results, and only offload +unmatched inputs to the cloud for full inference. Realization of this idea, +however, is non-trivial: the device needs to compare acoustic features in a +robust, low-cost way. To this end, we present XYZ, a speech cache for tiny +devices. It matches speech inputs at two levels of representations: first by +clustered sequences of raw sound units, then as sequences of phonemes. Working +in tandem, the two representations offer complementary cost/accuracy tradeoffs. +To further boost accuracy, our cache is learning: with the mismatched and then +offloaded inputs, it continuously finetunes the device's feature extractors +(with the assistance of the cloud). We implement XYZ on an off-the-shelf STM32 +microcontroller. The resultant implementation has a small memory footprint of +2MB. Evaluated on challenging speech benchmarks, our system resolves 45%--90% +of inputs on device, reducing the average latency by up to 80% compared to +offloading to popular cloud speech services. Our benefit is pronounced even in +adversarial settings -- noisy environments, cold cache, or one device shared by +a number of users. + +
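+ The cache-then-offload control flow described above can be illustrated with a
+toy two-level lookup; the matching functions, thresholds, and class names below
+are placeholders rather than the system's real implementation.
+
+ from difflib import SequenceMatcher
+
+ class SpeechCache:
+     def __init__(self, unit_thresh=0.90, phone_thresh=0.85):
+         self.entries = []  # (sound_units, phonemes, cached_slu_result)
+         self.unit_thresh = unit_thresh
+         self.phone_thresh = phone_thresh
+
+     @staticmethod
+     def _sim(a, b):
+         return SequenceMatcher(None, a, b).ratio()
+
+     def lookup(self, sound_units, phonemes):
+         for units, phones, result in self.entries:
+             if self._sim(sound_units, units) >= self.unit_thresh:
+                 return result  # level-1 hit: clustered sound-unit sequence
+             if self._sim(phonemes, phones) >= self.phone_thresh:
+                 return result  # level-2 hit: phoneme sequence
+         return None  # miss: offload to the cloud, then insert the result
+
+     def insert(self, sound_units, phonemes, result):
+         self.entries.append((sound_units, phonemes, result))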
+
+ comment: submitted to Mobisys 2024 +
+
+
+
+
+ + ♻ ☆ Knowledge Pursuit Prompting for Zero-Shot Multimodal Synthesis + + +
+ Hallucinations and unfaithful synthesis due to inaccurate prompts with +insufficient semantic details are widely observed in multimodal generative +models. A prevalent strategy to align multiple modalities is to fine-tune the +generator with a large number of annotated text-image pairs. However, such a +procedure is labor-consuming and resource-draining. The key question we ask is: +can we enhance the quality and faithfulness of text-driven generative models +beyond extensive text-image pair annotations? To address this question, we +propose Knowledge Pursuit Prompting (KPP), a zero-shot framework that +iteratively incorporates external knowledge to help generators produce reliable +visual content. Instead of training generators to handle generic prompts, KPP +employs a recursive knowledge query process to gather informative external +facts from the knowledge base, instructs a language model to compress the +acquired knowledge for prompt refinement, and utilizes text-driven generators +for visual synthesis. The entire process is zero-shot, without accessing the +architectures and parameters of generative models. We evaluate the framework +across multiple text-driven generative tasks (image, 3D rendering, and video) +on datasets of different domains. We further demonstrate the extensibility and +adaptability of KPP through varying foundation model bases and instructions. +Our results show that KPP is capable of generating faithful and semantically +rich content across diverse visual domains, offering a promising solution to +improve multimodal generative models. + +
+
+
+
+
+ + ♻ ☆ A Baseline Analysis of Reward Models' Ability To Accurately Analyze + Foundation Models Under Distribution Shift + + +
+ Foundation models, specifically Large Language Models (LLMs), have lately
+gained widespread attention and adoption. Reinforcement Learning with Human
+Feedback (RLHF) involves training a reward model to capture desired behaviors,
+which is then used to align an LLM. These reward models are additionally used
+at inference time to estimate how well LLM responses adhere to those desired
+behaviors. However, there is little work measuring how robust these reward
+models are to distribution shifts. In this work, we evaluate how reward model
+performance - measured via accuracy and calibration (i.e. alignment between
+accuracy and confidence) - is affected by distribution shift. We show novel
+calibration patterns and accuracy drops due to OOD prompts and responses, and
+that the reward model is more sensitive to shifts in responses than prompts.
+Additionally, we adapt an OOD detection technique commonly used in
+classification to the reward model setting in order to detect these
+distribution shifts in prompts and responses.
+
+
+
+
+ + ♻ ☆ TimeGNN: Temporal Dynamic Graph Learning for Time Series Forecasting + + +
+ Time series forecasting lies at the core of important real-world applications
+in many fields of science and engineering. The abundance of large time series
+datasets that consist of complex patterns and long-term dependencies has led to
+the development of various neural network architectures. Graph neural network
+approaches, which jointly learn a graph structure based on the correlation of
+raw values of multivariate time series while forecasting, have recently seen
+great success. However, such solutions are often costly to train and difficult
+to scale. In this paper, we propose TimeGNN, a method that learns dynamic
+temporal graph representations that can capture the evolution of inter-series
+patterns along with the correlations of multiple series. TimeGNN achieves
+inference times 4 to 80 times faster than other state-of-the-art graph-based
+methods while achieving comparable forecasting performance.
+
+
+
+
+ + ♻ ☆ Locally Differentially Private Document Generation Using Zero Shot + Prompting EMNLP 2023 + + +
+ Numerous studies have highlighted the privacy risks associated with +pretrained large language models. In contrast, our research offers a unique +perspective by demonstrating that pretrained large language models can +effectively contribute to privacy preservation. We propose a locally +differentially private mechanism called DP-Prompt, which leverages the power of +pretrained large language models and zero-shot prompting to counter author +de-anonymization attacks while minimizing the impact on downstream utility. +When DP-Prompt is used with a powerful language model like ChatGPT (gpt-3.5), +we observe a notable reduction in the success rate of de-anonymization attacks, +showing that it surpasses existing approaches by a considerable margin despite +its simpler design. For instance, in the case of the IMDB dataset, DP-Prompt +(with ChatGPT) perfectly recovers the clean sentiment F1 score while achieving +a 46\% reduction in author identification F1 score against static attackers and +a 26\% reduction against adaptive attackers. We conduct extensive experiments +across six open-source large language models, ranging up to 7 billion +parameters, to analyze various effects of the privacy-utility tradeoff. + +
+
+ comment: Accepted at EMNLP 2023 (Findings) +
+
+
+
+
+ + ♻ ☆ Two-step reinforcement learning for model-free redesign of nonlinear + optimal regulator + + +
+ In many practical control applications, the performance level of a +closed-loop system degrades over time due to the change of plant +characteristics. Thus, there is a strong need for redesigning a controller +without going through the system modeling process, which is often difficult for +closed-loop systems. Reinforcement learning (RL) is one of the promising +approaches that enable model-free redesign of optimal controllers for nonlinear +dynamical systems based only on the measurement of the closed-loop system. +However, the learning process of RL usually requires a considerable number of +trial-and-error experiments using the poorly controlled system that may +accumulate wear on the plant. To overcome this limitation, we propose a +model-free two-step design approach that improves the transient learning +performance of RL in an optimal regulator redesign problem for unknown +nonlinear systems. Specifically, we first design a linear control law that +attains some degree of control performance in a model-free manner, and then, +train the nonlinear optimal control law with online RL by using the designed +linear control law in parallel. We introduce an offline RL algorithm for the +design of the linear control law and theoretically guarantee its convergence to +the LQR controller under mild assumptions. Numerical simulations show that the +proposed approach improves the transient learning performance and efficiency in +hyperparameter tuning of RL. + +
+
+
+
+
+ + ♻ ☆ LocoMuJoCo: A Comprehensive Imitation Learning Benchmark for Locomotion + + +
+ Imitation Learning (IL) holds great promise for enabling agile locomotion in +embodied agents. However, many existing locomotion benchmarks primarily focus +on simplified toy tasks, often failing to capture the complexity of real-world +scenarios and steering research toward unrealistic domains. To advance research +in IL for locomotion, we present a novel benchmark designed to facilitate +rigorous evaluation and comparison of IL algorithms. This benchmark encompasses +a diverse set of environments, including quadrupeds, bipeds, and +musculoskeletal human models, each accompanied by comprehensive datasets, such +as real noisy motion capture data, ground truth expert data, and ground truth +sub-optimal data, enabling evaluation across a spectrum of difficulty levels. +To increase the robustness of learned agents, we provide an easy interface for +dynamics randomization and offer a wide range of partially observable tasks to +train agents across different embodiments. Finally, we provide handcrafted +metrics for each task and ship our benchmark with state-of-the-art baseline +algorithms to ease evaluation and enable fast benchmarking. + +
+
+ comment: https://github.com/robfiras/loco-mujoco +
+
+
+
+
+ + ♻ ☆ Understanding Sample Generation Strategies for Learning Heuristic + Functions in Classical Planning + + +
+ We study the problem of learning good heuristic functions for classical +planning tasks with neural networks based on samples represented by states with +their cost-to-goal estimates. The heuristic function is learned for a state +space and goal condition with the number of samples limited to a fraction of +the size of the state space, and must generalize well for all states of the +state space with the same goal condition. Our main goal is to better understand +the influence of sample generation strategies on the performance of a greedy +best-first heuristic search (GBFS) guided by a learned heuristic function. In a +set of controlled experiments, we find that two main factors determine the +quality of the learned heuristic: which states are included in the sample set +and the quality of the cost-to-goal estimates. These two factors are dependent: +having perfect cost-to-goal estimates is insufficient if the samples are not +well distributed across the state space. We also study other effects, such as +adding samples with high-value estimates. Based on our findings, we propose +practical strategies to improve the quality of learned heuristics: three +strategies that aim to generate more representative states and two strategies +that improve the cost-to-goal estimates. Our practical strategies almost double +the mean coverage of a GBFS algorithm guided by a learned heuristic. + +
+
+ comment: 28 pages +
+
+
+
+
+ + ♻ ☆ Focused Transformer: Contrastive Training for Context Scaling NeurIPS 2023 + + +
+ Large language models have an exceptional capability to incorporate new
+information in a contextual manner. However, the full potential of such an
+approach is often restrained due to a limitation in the effective context
+length. One solution to this issue is to endow an attention layer with access
+to an external memory, which comprises (key, value) pairs. Yet, as the
+number of documents increases, the proportion of relevant keys to irrelevant
+ones decreases, leading the model to focus more on the irrelevant keys. We
+identify a significant challenge, dubbed the distraction issue, where keys
+linked to different semantic values might overlap, making them hard to
+distinguish. To tackle this problem, we introduce the Focused Transformer
+(FoT), a technique that employs a training process inspired by contrastive
+learning. This novel approach enhances the structure of the (key, value) space,
+enabling an extension of the context length. Our method allows for fine-tuning
+pre-existing, large-scale models to lengthen their effective context. This is
+demonstrated by our fine-tuning of $3B$ and $7B$ OpenLLaMA checkpoints. The
+resulting models, which we name LongLLaMA, exhibit advancements in tasks
+requiring a long context. We further illustrate that our LongLLaMA models
+adeptly manage a $256 k$ context length for passkey retrieval.
+
+ comment: Accepted at 37th Conference on Neural Information Processing Systems + (NeurIPS 2023). 28 pages, 10 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ Hessian-Aware Bayesian Optimization for Decision Making Systems ICLR + + +
+ Many approaches for optimizing decision making systems rely on gradient based +methods requiring informative feedback from the environment. However, in the +case where such feedback is sparse or uninformative, such approaches may result +in poor performance. Derivative-free approaches such as Bayesian Optimization +mitigate the dependency on the quality of gradient feedback, but are known to +scale poorly in the high-dimension setting of complex decision making systems. +This problem is exacerbated if the system requires interactions between several +actors cooperating to accomplish a shared goal. To address the dimensionality +challenge, we propose a compact multi-layered architecture modeling the +dynamics of actor interactions through the concept of role. Additionally, we +introduce Hessian-aware Bayesian Optimization to efficiently optimize the +multi-layered architecture parameterized by a large number of parameters. +Experimental results demonstrate that our method (HA-GP-UCB) works effectively +on several benchmarks under resource constraints and malformed feedback +settings. + +
+
+ comment: Revision after ICLR feedback +
+
+
+
+
+ + ♻ ☆ Bayesian CART models for insurance claims frequency + + +
+ Accuracy and interpretability of a (non-life) insurance pricing model are
+essential qualities to ensure fair and transparent premiums for policy-holders
+that reflect their risk. In recent years, classification and regression
+trees (CARTs) and their ensembles have gained popularity in the actuarial
+literature, since they offer good prediction performance and are relatively
+easily interpretable. In this paper, we introduce Bayesian CART models for
+insurance pricing, with a particular focus on claims frequency modelling.
+In addition to the common Poisson and negative binomial (NB) distributions
+used for claims frequency, we implement Bayesian CART for the zero-inflated
+Poisson (ZIP) distribution to address the difficulty arising from the
+imbalanced insurance claims data. To this end, we introduce a general MCMC
+algorithm using data augmentation methods for posterior tree exploration. We
+also introduce the deviance information criterion (DIC) for the tree model
+selection. The proposed models are able to identify trees which can better
+classify the policy-holders into risk groups. Some simulations and real
+insurance data will be discussed to illustrate the applicability of these
+models.
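+ For reference, the zero-inflated Poisson (ZIP) distribution mixes a point mass
+at zero with a Poisson component: with inflation probability $\pi$ and rate
+$\lambda$,
+ $$\Pr(Y=0)=\pi+(1-\pi)e^{-\lambda}, \qquad \Pr(Y=k)=(1-\pi)\,\frac{e^{-\lambda}\lambda^{k}}{k!} \quad (k \ge 1),$$
+which makes it a natural choice for claims data dominated by policies with no
+claims; in a Bayesian CART each leaf would presumably carry its own ZIP
+parameters.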
+
+ comment: 46 pages +
+
+
+
+
+ + ♻ ☆ Some Intriguing Aspects about Lipschitz Continuity of Neural Networks + + +
+ Lipschitz continuity is a crucial functional property of any predictive
+model that naturally governs its robustness, generalisation, and
+adversarial vulnerability. Contrary to other works that focus on obtaining
+tighter bounds and developing different practical strategies to enforce certain
+Lipschitz properties, we aim to thoroughly examine and characterise the
+Lipschitz behaviour of Neural Networks. Thus, we carry out an empirical
+investigation in a range of different settings (namely, architectures,
+datasets, label noise, and more) by exhausting the limits of the simplest and
+the most general lower and upper bounds. As a highlight of this investigation,
+we showcase a remarkable fidelity of the lower Lipschitz bound, identify a
+striking Double Descent trend in both the upper and lower bounds on the
+Lipschitz constant, and explain the intriguing effects of label noise on
+function smoothness and generalisation.
+
+
+
+
+
+ + ♻ ☆ ANPL: Towards Natural Programming with Interactive Decomposition + + +
+ Though LLMs are capable of generating plausible programs, it's challenging to
+interact with the LLMs further to revise the program, especially if the user's
+specific requirements are different from the initial proposal. In this paper,
+we introduce ANPL, an interactive programming system that ensures users can
+always refine the generated code towards their specific programmatic intents
+via structured decompositions. Borrowing the paradigm of sketching from program
+synthesis, an ANPL program consists of a set of input-outputs that it must
+satisfy, a ``sketch'' -- control/data flow expressed in precise code (e.g.
+Python), and ``holes'' -- sub-modules to be implemented by the LLM specified
+with natural language. The user revises an ANPL program by either modifying the
+sketch, changing the language used to describe the holes, or providing
+additional input-outputs to a particular hole, turning it into a sub-ANPL
+program that can be solved recursively. This workflow allows the users to
+offload programming burdens to the LLM as much as possible while retaining the
+ability to pinpoint and resolve bugs locally, without exposing the rest of the
+program to the LLM. We deploy ANPL on the Abstraction and Reasoning Corpus
+(ARC), a set of unique tasks that are challenging for state-of-the-art AI
+systems, showing it outperforms baseline programming systems that (a) lack the
+ability to decompose tasks interactively and (b) offer no guarantee that the
+modules can be correctly composed together. Additional evaluations on APPS,
+HumanEval, and real-world programming tasks have validated that the ANPL
+framework is applicable to multiple programming domains. We release the ANPL
+solutions to the ARC tasks as a dataset, providing insights into how humans
+decompose novel tasks programmatically. See our code at
+https://iprc-dip.github.io/ANPL/.
+
+
+
+
+
+ + ♻ ☆ Leveraging Low-Rank and Sparse Recurrent Connectivity for Robust + Closed-Loop Control + + +
+ Developing autonomous agents that can interact with changing environments is +an open challenge in machine learning. Robustness is particularly important in +these settings as agents are often fit offline on expert demonstrations but +deployed online where they must generalize to the closed feedback loop within +the environment. In this work, we explore the application of recurrent neural +networks to tasks of this nature and understand how a parameterization of their +recurrent connectivity influences robustness in closed-loop settings. +Specifically, we represent the recurrent connectivity as a function of rank and +sparsity and show both theoretically and empirically that modulating these two +variables has desirable effects on network dynamics. The proposed low-rank, +sparse connectivity induces an interpretable prior on the network that proves +to be most amenable for a class of models known as closed-form continuous-time +neural networks (CfCs). We find that CfCs with fewer parameters can outperform +their full-rank, fully-connected counterparts in the online setting under +distribution shift. This yields memory-efficient and robust agents while +opening a new perspective on how we can modulate network dynamics through +connectivity. + +
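+ As a rough illustration of the kind of connectivity parameterization described
+above -- a recurrent weight matrix that is simultaneously low-rank and sparse --
+the following sketch uses PyTorch; the rank, sparsity level, and variable names
+are illustrative assumptions, not values taken from the paper.
+
+```python
+import torch
+
+hidden, rank, density = 128, 8, 0.2
+
+# Low-rank factors: U @ V.T has rank at most `rank`.
+U = torch.nn.Parameter(0.1 * torch.randn(hidden, rank))
+V = torch.nn.Parameter(0.1 * torch.randn(hidden, rank))
+
+# Fixed binary mask keeping roughly `density` of the recurrent connections.
+mask = (torch.rand(hidden, hidden) < density).float()
+
+def recurrent_weights() -> torch.Tensor:
+    # Effective recurrent connectivity: low-rank and sparse at the same time.
+    return (U @ V.T) * mask
+
+# One recurrent update with the structured connectivity.
+h = torch.zeros(1, hidden)
+x_t = torch.randn(1, hidden)
+h = torch.tanh(h @ recurrent_weights().T + x_t)
+```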
+
+
+
+
+ + ♻ ☆ Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for + Large Language Models + + +
+ Large Language Models (LLMs), with their remarkable task-handling +capabilities and innovative outputs, have catalyzed significant advancements +across a spectrum of fields. However, their proficiency within specialized +domains such as biomolecular studies remains limited. To address this +challenge, we introduce Mol-Instructions, a comprehensive instruction dataset +designed for the biomolecular domain. Mol-Instructions encompasses three key +components: molecule-oriented instructions, protein-oriented instructions, and +biomolecular text instructions. Each component aims to improve the +understanding and prediction capabilities of LLMs concerning biomolecular +features and behaviors. Through extensive instruction tuning experiments on +LLMs, we demonstrate the effectiveness of Mol-Instructions in enhancing large +models' performance in the intricate realm of biomolecular studies, thus +fostering progress in the biomolecular research community. Mol-Instructions is +publicly available for ongoing research and will undergo regular updates to +enhance its applicability. + +
+
+ comment: Project homepage: https://github.com/zjunlp/Mol-Instructions, add + more experiments +
+
+
+
+
+ + ♻ ☆ Less is More -- Towards parsimonious multi-task models using structured + sparsity + + +
+ Model sparsification in deep learning promotes simpler, more interpretable
+models with fewer parameters. This not only reduces the model's memory
+footprint and computational needs but also shortens inference time. This work
+focuses on creating sparse models optimized for multiple tasks with fewer
+parameters. These parsimonious models also possess the potential to match or
+outperform dense models in terms of performance. In this work, we introduce
+channel-wise l1/l2 group sparsity in the shared convolutional layers'
+parameters (or weights) of the multi-task learning model. This approach
+facilitates the removal of extraneous groups, i.e., channels (due to l1
+regularization), and also imposes a penalty on the weights, further enhancing
+the learning efficiency for all tasks (due to l2 regularization). We analyzed
+the results of group sparsity in both single-task and multi-task settings on
+two widely-used Multi-Task Learning (MTL) datasets: NYU-v2 and CelebAMask-HQ.
+On both datasets, which consist of three different computer vision tasks each,
+multi-task models with approximately 70% sparsity outperform their dense
+equivalents. We also investigate how changing the degree of sparsification
+influences the model's performance, the overall sparsity percentage, the
+patterns of sparsity, and the inference time.
+
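+ A minimal sketch of a channel-wise group-sparsity penalty of the kind described
+above (PyTorch; the coefficient and tensor shapes are assumptions for
+illustration, not the paper's settings):
+
+```python
+import torch
+
+def channel_group_penalty(conv_weight: torch.Tensor, coeff: float = 1e-4) -> torch.Tensor:
+    """l1 norm over per-output-channel l2 norms (a group-lasso-style term).
+
+    conv_weight: shared conv layer weights of shape (out_ch, in_ch, kH, kW).
+    The outer l1 sum can drive whole channels (groups) to zero, while the
+    inner l2 norm shrinks the weights within each remaining channel.
+    """
+    per_channel_l2 = conv_weight.flatten(start_dim=1).norm(p=2, dim=1)
+    return coeff * per_channel_l2.sum()
+
+# Usage: add the penalty of every shared conv layer to the summed task losses.
+w = torch.randn(64, 32, 3, 3, requires_grad=True)
+loss = channel_group_penalty(w)
+loss.backward()
+```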
+
+ comment: accepted at First Conference on Parsimony and Learning (CPAL 2024) +
+
+
+
+
+ + ♻ ☆ KOPPA: Improving Prompt-based Continual Learning with Key-Query + Orthogonal Projection and Prototype-based One-Versus-All + + +
+ Drawing inspiration from prompt tuning techniques applied to Large Language +Models, recent methods based on pre-trained ViT networks have achieved +remarkable results in the field of Continual Learning. Specifically, these +approaches propose to maintain a set of prompts and allocate a subset of them +to learn each task using a key-query matching strategy. However, they may +encounter limitations when lacking control over the correlations between old +task queries and keys of future tasks, the shift of features in the latent +space, and the relative separation of latent vectors learned in independent +tasks. In this work, we introduce a novel key-query learning strategy based on +orthogonal projection, inspired by model-agnostic meta-learning, to enhance +prompt matching efficiency and address the challenge of shifting features. +Furthermore, we introduce a One-Versus-All (OVA) prototype-based component that +enhances the classification head distinction. Experimental results on benchmark +datasets demonstrate that our method empowers the model to achieve results +surpassing those of current state-of-the-art approaches by a large margin of up +to 20%. + +
+
+
+
+
+ + ♻ ☆ ClustML: A Measure of Cluster Pattern Complexity in Scatterplots Learnt + from Human-labeled Groupings + + +
+ Visual quality measures (VQMs) are designed to support analysts by
+automatically detecting and quantifying patterns in visualizations. We propose
+a new VQM for visual grouping patterns in scatterplots, called ClustML, which
+is trained on previously collected human subject judgments. Our model encodes
+scatterplots in the parametric space of a Gaussian Mixture Model and uses a
+classifier trained on human judgment data to estimate the perceptual complexity
+of grouping patterns from the numbers of initial mixture components and final
+combined groups. It improves on existing VQMs, first, by better estimating
+human judgments on two-Gaussian cluster patterns and, second, by giving higher
+accuracy when ranking general cluster patterns in scatterplots. We use it to
+analyze kinship data for genome-wide association studies, in which experts rely
+on the visual analysis of large sets of scatterplots. We make the benchmark
+datasets and the new VQM available for practical use and further improvements.
+
+
+
+
+
+ + ♻ ☆ Aggregated f-average Neural Network for Interpretable Ensembling + + +
+ Ensemble learning leverages multiple models (i.e., weak learners) on a common
+machine learning task to enhance prediction performance. Basic ensembling
+approaches average the weak learners' outputs, while more sophisticated ones
+stack a machine learning model between the weak learners' outputs and the
+final prediction. This work fuses both aforementioned frameworks. We introduce
+an aggregated f-average (AFA) shallow neural network which models and combines
+different types of averages to perform an optimal aggregation of the weak
+learners' predictions. We emphasise its interpretable architecture and simple
+training strategy, and illustrate its good performance on the problem of
+few-shot class incremental learning.
+
+
+ comment: 6 pages +
+
+
+
+
+ + ♻ ☆ Extending Explainable Boosting Machines to Scientific Image Data + + +
+ As the deployment of computer vision technology becomes increasingly common +in science, the need for explanations of the system and its output has become a +focus of great concern. Driven by the pressing need for interpretable models in +science, we propose the use of Explainable Boosting Machines (EBMs) for +scientific image data. Inspired by an important application underpinning the +development of quantum technologies, we apply EBMs to cold-atom soliton image +data tabularized using Gabor Wavelet Transform-based techniques that preserve +the spatial structure of the data. In doing so, we demonstrate the use of EBMs +for image data for the first time and show that our approach provides +explanations that are consistent with human intuition about the data. + +
+
+ comment: 7 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Likelihood-based Sensor Calibration using Affine Transformation + + +
+ An important task in the field of sensor technology is the efficient
+implementation of adaptation procedures of measurements from one sensor to
+another sensor of identical design. One idea is to use the estimation of an
+affine transformation between different systems, which can be improved by the
+knowledge of experts. This paper presents an improved solution from Glacier
+Research that was published back in 1973. The results demonstrate the
+adaptability of this solution for various applications, including software
+calibration of sensors, implementation of expert-based adaptation, and paving
+the way for future advancements such as distributed learning methods. We
+evaluate our research with simulations and also with real measured data from a
+multi-sensor board with 8 identical sensors. Both the data set and the
+evaluation script are provided for download. The results show an improvement
+for both the simulation and the experiments with real data.
+
+
+
+
+
+ + ♻ ☆ Learning Deep O($n$)-Equivariant Hyperspheres + + +
+ This paper presents an approach to learning (deep) $n$D features equivariant +under orthogonal transformations, utilizing hyperspheres and regular +$n$-simplexes. Our main contributions are theoretical and tackle major +challenges in geometric deep learning such as equivariance and invariance under +geometric transformations. Namely, we enrich the recently developed theory of +steerable 3D spherical neurons -- SO(3)-equivariant filter banks based on +neurons with spherical decision surfaces -- by extending said neurons to $n$D, +which we call deep equivariant hyperspheres, and enabling their multi-layer +construction. Using synthetic and real-world data in $n$D, we experimentally +verify our theoretical contributions and find that our approach is superior to +the competing methods for benchmark datasets in all but one case, additionally +demonstrating a better speed/performance trade-off in all but one other case. + +
+
+
+
+
+ + ♻ ☆ Operator Learning with Neural Fields: Tackling PDEs on General + Geometries + + +
+ Machine learning approaches for solving partial differential equations
+require learning mappings between function spaces. While convolutional or graph
+neural networks are constrained to discretized functions, neural operators
+present a promising milestone toward mapping functions directly. Despite
+impressive results, they still face challenges with respect to the domain
+geometry and typically rely on some form of discretization. In order to
+alleviate such limitations, we present CORAL, a new method that leverages
+coordinate-based networks for solving PDEs on general geometries. CORAL is
+designed to remove constraints on the input mesh, making it applicable to any
+spatial sampling and geometry. Its ability extends to diverse problem domains,
+including PDE solving, spatio-temporal forecasting, and inverse problems like
+geometric design. CORAL demonstrates robust performance across multiple
+resolutions and performs well in both convex and non-convex domains, surpassing
+or performing on par with state-of-the-art models.
+
+
+
+
+
+ + ♻ ☆ Hierarchical Joint Graph Learning and Multivariate Time Series + Forecasting NeurIPS 2023 + + +
+ Multivariate time series is prevalent in many scientific and industrial +domains. Modeling multivariate signals is challenging due to their long-range +temporal dependencies and intricate interactions--both direct and indirect. To +confront these complexities, we introduce a method of representing multivariate +signals as nodes in a graph with edges indicating interdependency between them. +Specifically, we leverage graph neural networks (GNN) and attention mechanisms +to efficiently learn the underlying relationships within the time series data. +Moreover, we suggest employing hierarchical signal decompositions running over +the graphs to capture multiple spatial dependencies. The effectiveness of our +proposed model is evaluated across various real-world benchmark datasets +designed for long-term forecasting tasks. The results consistently showcase the +superiority of our model, achieving an average 23\% reduction in mean squared +error (MSE) compared to existing models. + +
+
+ comment: Temporal Graph Learning Workshop @ NeurIPS 2023, New Orleans, United + States +
+
+
+
+
+ + ♻ ☆ Relating graph auto-encoders to linear models + + +
+ Graph auto-encoders are widely used to construct graph representations in +Euclidean vector spaces. However, it has already been pointed out empirically +that linear models on many tasks can outperform graph auto-encoders. In our +work, we prove that the solution space induced by graph auto-encoders is a +subset of the solution space of a linear map. This demonstrates that linear +embedding models have at least the representational power of graph +auto-encoders based on graph convolutional networks. So why are we still using +nonlinear graph auto-encoders? One reason could be that actively restricting +the linear solution space might introduce an inductive bias that helps improve +learning and generalization. While many researchers believe that the +nonlinearity of the encoder is the critical ingredient towards this end, we +instead identify the node features of the graph as a more powerful inductive +bias. We give theoretical insights by introducing a corresponding bias in a +linear model and analyzing the change in the solution space. Our experiments +are aligned with other empirical work on this question and show that the linear +encoder can outperform the nonlinear encoder when using feature information. + +
+
+ comment: accepted to TMLR +
+
+
+
+
+ + ♻ ☆ Training a HyperDimensional Computing Classifier using a Threshold on + its Confidence + + +
+ Hyperdimensional computing (HDC) has become popular for lightweight and
+energy-efficient machine learning, suitable for wearable Internet-of-Things
+(IoT) devices and near-sensor or on-device processing. HDC is computationally
+less complex than traditional deep learning algorithms and achieves moderate to
+good classification performance. This article proposes to extend the training
+procedure in HDC by taking into account not only wrongly classified samples,
+but also samples that are correctly classified by the HDC model but with low
+confidence. As such, a confidence threshold is introduced that can be tuned for
+each dataset to achieve the best classification accuracy. The proposed training
+procedure is tested on the UCIHAR, CTG, ISOLET and HAND datasets, for which the
+performance consistently improves compared to the baseline across a range of
+confidence threshold values. The extended training procedure also results in a
+shift towards higher confidence values for the correctly classified samples,
+making the classifier not only more accurate but also more confident about its
+predictions.
+
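+ A toy sketch of the extended update rule described above -- retraining not only
+on misclassified samples but also on correctly classified ones whose confidence
+falls below a threshold (NumPy; the bipolar encoding, the margin-based notion of
+confidence, and the threshold value are simplifying assumptions, not the paper's
+exact formulation):
+
+```python
+import numpy as np
+
+def train_epoch(class_hvs, samples, labels, lr=1.0, conf_threshold=0.05):
+    """class_hvs: (num_classes, D) class hypervectors; samples: (N, D) encoded inputs."""
+    for x, y in zip(samples, labels):
+        sims = class_hvs @ x / (np.linalg.norm(class_hvs, axis=1) * np.linalg.norm(x) + 1e-9)
+        pred = int(np.argmax(sims))
+        margin = np.sort(sims)[-1] - np.sort(sims)[-2]  # confidence proxy
+        if pred != y or margin < conf_threshold:        # wrong OR low-confidence
+            class_hvs[y] += lr * x                      # pull the true class closer
+            if pred != y:
+                class_hvs[pred] -= lr * x               # push the wrong class away
+    return class_hvs
+
+rng = np.random.default_rng(0)
+D, num_classes = 10000, 4
+class_hvs = rng.choice([-1.0, 1.0], size=(num_classes, D))
+samples = rng.choice([-1.0, 1.0], size=(100, D))
+labels = rng.integers(0, num_classes, size=100)
+class_hvs = train_epoch(class_hvs, samples, labels)
+```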
+
+
+
+
+ + ♻ ☆ A Natural Gas Consumption Forecasting System for Continual Learning + Scenarios based on Hoeffding Trees with Change Point Detection Mechanism + + +
+ Forecasting natural gas consumption, considering seasonality and trends, is
+crucial in planning its supply and consumption and optimizing the cost of
+obtaining it, mainly by industrial entities. However, in times of threats to
+its supply, it is also a critical element that guarantees the supply of this
+raw material to meet individual consumers' needs, ensuring society's energy
+security. This article introduces a novel multistep-ahead forecasting approach
+for natural gas consumption that integrates change point detection for model
+collection selection, with continual learning capabilities, using data stream
+processing. The performance of the forecasting models based on the proposed
+approach is evaluated in a complex real-world use case of natural gas
+consumption forecasting. We employed Hoeffding tree predictors as forecasting
+models and the Pruned Exact Linear Time (PELT) algorithm for the change point
+detection procedure. The change point detection integration enables selecting a
+different model collection for successive time frames. Thus, three model
+collection selection procedures (with and without an error feedback loop) are
+defined and evaluated for forecasting scenarios with various densities of
+detected change points. These models were compared with change point agnostic
+baseline approaches. Our experiments show that fewer change points result in a
+lower forecasting error regardless of the model collection selection procedure
+employed. Also, simpler model collection selection procedures that omit
+forecasting error feedback lead to more robust forecasting models suitable for
+continual learning tasks.
+
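+ For the change-point detection step, the PELT algorithm is available in the
+`ruptures` package; a minimal usage sketch (the cost model and penalty value
+below are illustrative assumptions, not the settings used in the paper):
+
+```python
+import numpy as np
+import ruptures as rpt
+
+# Synthetic consumption-like series with a single shift in level.
+rng = np.random.default_rng(0)
+signal = np.concatenate([rng.normal(10, 1, 200), rng.normal(14, 1, 200)])
+
+algo = rpt.Pelt(model="rbf").fit(signal)  # Pruned Exact Linear Time search
+change_points = algo.predict(pen=10)      # end indices of detected segments
+print(change_points)                      # e.g. [200, 400]
+```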
+
+
+
+
+ + ♻ ☆ On the Convergence of the ELBO to Entropy Sums + + +
+ The variational lower bound (a.k.a. ELBO or free energy) is the central
+objective for many established as well as many novel algorithms for
+unsupervised learning. Learning algorithms change model parameters such that
+the variational lower bound increases. Learning usually proceeds until
+parameters have converged to values close to a stationary point of the learning
+dynamics. In this purely theoretical contribution, we show that (for a very
+large class of generative models) the variational lower bound is at all
+stationary points of learning equal to a sum of entropies. For standard machine
+learning models with one set of latents and one set of observed variables, the
+sum consists of three entropies: (A) the (average) entropy of the variational
+distributions, (B) the negative entropy of the model's prior distribution, and
+(C) the (expected) negative entropy of the observable distributions. The
+obtained result applies under realistic conditions including: finite numbers of
+data points, at any stationary point (including saddle points) and for any
+family of (well-behaved) variational distributions. The class of generative
+models for which we show the equality to entropy sums contains many well-known
+generative models. As concrete examples we discuss Sigmoid Belief Networks,
+probabilistic PCA and (Gaussian and non-Gaussian) mixture models. The results
+also apply to standard (Gaussian) variational autoencoders, as has been shown
+in parallel work (Damm et al., 2023). The prerequisites we use to show
+equality to entropy sums are relatively mild. Concretely, the distributions of
+a given generative model have to be of the exponential family (with constant
+base measure), and the model has to satisfy a parameterization criterion (which
+is usually fulfilled). Proving the equality of the ELBO to entropy sums at
+stationary points (under the stated conditions) is the main contribution of
+this work.
+
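+ In symbols, the claimed identity at stationary points can be rendered
+schematically as follows (notation chosen here for illustration; the three terms
+correspond to (A)-(C) above):
+
+\[
+  \mathcal{F}(\Phi,\Theta)
+  \;=\; \frac{1}{N}\sum_{n=1}^{N} \mathcal{H}\!\left[q^{(n)}_{\Phi}(z)\right]
+  \;-\; \mathcal{H}\!\left[p_{\Theta}(z)\right]
+  \;-\; \frac{1}{N}\sum_{n=1}^{N}
+        \mathbb{E}_{q^{(n)}_{\Phi}}\!\left[\mathcal{H}\!\left[p_{\Theta}(x \mid z)\right]\right],
+\]
+
+where $\mathcal{H}[\cdot]$ denotes entropy, $q^{(n)}_{\Phi}$ is the variational
+distribution for data point $n$, and $p_{\Theta}$ is the generative model.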
+
+ comment: 28 pages +
+
+
+
+
+ + ♻ ☆ Towards Responsible Governance of Biological Design Tools NeurIPS 2023 + + +
+ Recent advancements in generative machine learning have enabled rapid +progress in biological design tools (BDTs) such as protein structure and +sequence prediction models. The unprecedented predictive accuracy and novel +design capabilities of BDTs present new and significant dual-use risks. For +example, their predictive accuracy allows biological agents, whether vaccines +or pathogens, to be developed more quickly, while the design capabilities could +be used to discover drugs or evade DNA screening techniques. Similar to other +dual-use AI systems, BDTs present a wicked problem: how can regulators uphold +public safety without stifling innovation? We highlight how current regulatory +proposals that are primarily tailored toward large language models may be less +effective for BDTs, which require fewer computational resources to train and +are often developed in an open-source manner. We propose a range of measures to +mitigate the risk that BDTs are misused, across the areas of responsible +development, risk assessment, transparency, access management, cybersecurity, +and investing in resilience. Implementing such measures will require close +coordination between developers and governments. + +
+
+ comment: 10 pages + references, 1 figure, accepted at NeurIPS 2023 Workshop on + Regulatable ML as oral presentation +
+
+
+
+
+ + ♻ ☆ Unsupervised Discovery of Interpretable Directions in h-space of + Pre-trained Diffusion Models + + +
+ We propose the first unsupervised and learning-based method to identify
+interpretable directions in the h-space of pre-trained diffusion models. Our
+method is derived from an existing technique that operates on the GAN latent
+space. Specifically, we employ a shift control module that works on the h-space
+of pre-trained diffusion models to manipulate a sample into a shifted version
+of itself, followed by a reconstructor to reproduce both the type and the
+strength of the manipulation. By jointly optimizing them, the model will
+spontaneously discover disentangled and interpretable directions. To prevent
+the discovery of meaningless and destructive directions, we employ a
+discriminator to maintain the fidelity of the shifted sample. Due to the
+iterative generative process of diffusion models, our training requires a
+substantial amount of GPU VRAM to store numerous intermediate tensors for
+back-propagating gradients. To address this issue, we propose a general
+VRAM-efficient training algorithm based on the gradient checkpointing technique
+to back-propagate any gradient through the whole generative process, with
+acceptable VRAM occupancy at the cost of some training efficiency. Compared
+with existing related works on diffusion models, our method inherently
+identifies global and scalable directions, without necessitating any other
+complicated procedures. Extensive experiments on various datasets demonstrate
+the effectiveness of our method.
+
+
+
+
+
+ + ♻ ☆ Analyzing Semantic Faithfulness of Language Models via Input + Intervention on Question Answering + + +
+ Transformer-based language models have been shown to be highly effective for +several NLP tasks. In this paper, we consider three transformer models, BERT, +RoBERTa, and XLNet, in both small and large versions, and investigate how +faithful their representations are with respect to the semantic content of +texts. We formalize a notion of semantic faithfulness, in which the semantic +content of a text should causally figure in a model's inferences in question +answering. We then test this notion by observing a model's behavior on +answering questions about a story after performing two novel semantic +interventions: deletion intervention and negation intervention. While +transformer models achieve high performance on standard question answering +tasks, we show that they fail to be semantically faithful once we perform these +interventions for a significant number of cases (~50% for deletion +intervention, and ~20% drop in accuracy for negation intervention). We then +propose an intervention-based training regime that can mitigate the undesirable +effects for deletion intervention by a significant margin (from ~ 50% to ~6%). +We analyze the inner-workings of the models to better understand the +effectiveness of intervention-based training for deletion intervention. But we +show that this training does not attenuate other aspects of semantic +unfaithfulness such as the models' inability to deal with negation intervention +or to capture the predicate-argument structure of texts. We also test +InstructGPT, via prompting, for its ability to handle the two interventions and +to capture predicate-argument structure. While InstructGPT models do achieve +very high performance on predicate-argument structure task, they fail to +respond adequately to our deletion and negation interventions. + +
+
+
+
+
+ + ♻ ☆ Language Models as Black-Box Optimizers for Vision-Language Models + + +
+ Vision-language models (VLMs) pre-trained on web-scale datasets have +demonstrated remarkable capabilities on downstream tasks when fine-tuned with +minimal data. However, many VLMs rely on proprietary data and are not +open-source, which restricts the use of white-box approaches for fine-tuning. +As such, we aim to develop a black-box approach to optimize VLMs through +natural language prompts, thereby avoiding the need to access model parameters, +feature embeddings, or even output logits. We propose employing chat-based LLMs +to search for the best text prompt for VLMs. Specifically, we adopt an +automatic hill-climbing procedure that converges to an effective prompt by +evaluating the performance of current prompts and asking LLMs to refine them +based on textual feedback, all within a conversational process without +human-in-the-loop. In a challenging 1-shot image classification setup, our +simple approach surpasses the white-box continuous prompting method (CoOp) by +an average of 1.5% across 11 datasets including ImageNet. Our approach also +outperforms both human-engineered and LLM-generated prompts. We highlight the +advantage of conversational feedback that incorporates both positive and +negative prompts, suggesting that LLMs can utilize the implicit gradient +direction in textual feedback for a more efficient search. In addition, we find +that the text prompts generated through our strategy are not only more +interpretable but also transfer well across different VLM architectures in a +black-box manner. Lastly, we demonstrate our framework on a state-of-the-art +black-box VLM (DALL-E 3) for text-to-image optimization. + +
+
+ comment: Project site: llm-can-optimize-vlm.github.io +
+
+
+
+
+ + ♻ ☆ Generative Models for Anomaly Detection and Design-Space Dimensionality + Reduction in Shape Optimization + + +
+ Our work presents a novel approach to shape optimization, with the twofold +objective to improve the efficiency of global optimization algorithms while +promoting the generation of high-quality designs during the optimization +process free of geometrical anomalies. This is accomplished by reducing the +number of the original design variables defining a new reduced subspace where +the geometrical variance is maximized and modeling the underlying generative +process of the data via probabilistic linear latent variable models such as +factor analysis and probabilistic principal component analysis. We show that +the data follows approximately a Gaussian distribution when the shape +modification method is linear and the design variables are sampled uniformly at +random, due to the direct application of the central limit theorem. The degree +of anomalousness is measured in terms of Mahalanobis distance, and the paper +demonstrates that abnormal designs tend to exhibit a high value of this metric. +This enables the definition of a new optimization model where anomalous +geometries are penalized and consequently avoided during the optimization loop. +The procedure is demonstrated for hull shape optimization of the DTMB 5415 +model, extensively used as an international benchmark for shape optimization +problems. The global optimization routine is carried out using Bayesian +optimization and the DIRECT algorithm. From the numerical results, the new +framework improves the convergence of global optimization algorithms, while +only designs with high-quality geometrical features are generated through the +optimization routine thereby avoiding the wastage of precious computationally +expensive simulations. + +
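+ Once a Gaussian has been fitted to the reduced design variables, the anomaly
+metric itself is straightforward to compute; a small sketch (NumPy, with
+synthetic data standing in for the reduced design coordinates and an assumed
+quantile-based threshold):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+Z = rng.normal(size=(500, 6))          # reduced design variables (illustrative)
+
+mean = Z.mean(axis=0)
+cov_inv = np.linalg.inv(np.cov(Z, rowvar=False))
+
+def mahalanobis(z: np.ndarray) -> float:
+    d = z - mean
+    return float(np.sqrt(d @ cov_inv @ d))
+
+# Designs whose distance exceeds a high quantile are flagged as likely anomalous
+# geometries and can be penalized inside the optimization loop.
+threshold = np.quantile([mahalanobis(z) for z in Z], 0.99)
+print(mahalanobis(5.0 * rng.normal(size=6)) > threshold)
+```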
+
+ comment: Accepted in Engineering Applications of Artificial Intelligence, + Elsevier +
+
+
+
+
+ + ♻ ☆ Grounding Foundation Models through Federated Transfer Learning: A + General Framework + + +
+ Foundation Models (FMs) such as GPT-4 encoded with vast knowledge and +powerful emergent abilities have achieved remarkable success in various natural +language processing and computer vision tasks. Grounding FMs by adapting them +to domain-specific tasks or augmenting them with domain-specific knowledge +enables us to exploit the full potential of FMs. However, grounding FMs faces +several challenges, stemming primarily from constrained computing resources, +data privacy, model heterogeneity, and model ownership. Federated Transfer +Learning (FTL), the combination of federated learning and transfer learning, +provides promising solutions to address these challenges. In recent years, the +need for grounding FMs leveraging FTL, coined FTL-FM, has arisen strongly in +both academia and industry. Motivated by the strong growth in FTL-FM research +and the potential impact of FTL-FM on industrial applications, we propose an +FTL-FM framework that formulates problems of grounding FMs in the federated +learning setting, construct a detailed taxonomy based on the FTL-FM framework +to categorize state-of-the-art FTL-FM works, and comprehensively overview +FTL-FM works based on the proposed taxonomy. We also establish correspondences +between FTL-FM and conventional phases of adapting FM so that FM practitioners +can align their research works with FTL-FM. In addition, we overview advanced +efficiency-improving and privacy-preserving techniques because efficiency and +privacy are critical concerns in FTL-FM. Last, we discuss opportunities and +future research directions of FTL-FM. + +
+
+ comment: in progress +
+
+
+
+
+ + ♻ ☆ Image retrieval outperforms diffusion models on data augmentation + + +
+ Many approaches have been proposed to use diffusion models to augment +training datasets for downstream tasks, such as classification. However, +diffusion models are themselves trained on large datasets, often with noisy +annotations, and it remains an open question to which extent these models +contribute to downstream classification performance. In particular, it remains +unclear if they generalize enough to improve over directly using the additional +data of their pre-training process for augmentation. We systematically evaluate +a range of existing methods to generate images from diffusion models and study +new extensions to assess their benefit for data augmentation. Personalizing +diffusion models towards the target data outperforms simpler prompting +strategies. However, using the pre-training data of the diffusion model alone, +via a simple nearest-neighbor retrieval procedure, leads to even stronger +downstream performance. Our study explores the potential of diffusion models in +generating new training data, and surprisingly finds that these sophisticated +models are not yet able to beat a simple and strong image retrieval baseline on +simple downstream vision tasks. + +
+
+
+
+
+ + ♻ ☆ LasTGL: An Industrial Framework for Large-Scale Temporal Graph Learning + + +
+ Over the past few years, graph neural networks (GNNs) have become powerful +and practical tools for learning on (static) graph-structure data. However, +many real-world applications, such as social networks and e-commerce, involve +temporal graphs where nodes and edges are dynamically evolving. Temporal graph +neural networks (TGNNs) have progressively emerged as an extension of GNNs to +address time-evolving graphs and have gradually become a trending research +topic in both academics and industry. Advancing research and application in +such an emerging field necessitates the development of new tools to compose +TGNN models and unify their different schemes for dealing with temporal graphs. +In this work, we introduce LasTGL, an industrial framework that integrates +unified and extensible implementations of common temporal graph learning +algorithms for various advanced tasks. The purpose of LasTGL is to provide the +essential building blocks for solving temporal graph learning tasks, focusing +on the guiding principles of user-friendliness and quick prototyping on which +PyTorch is based. In particular, LasTGL provides comprehensive temporal graph +datasets, TGNN models and utilities along with well-documented tutorials, +making it suitable for both absolute beginners and expert deep learning +practitioners alike. + +
+
+ comment: Preprint; Work in progress +
+
+
+
+
+ + ♻ ☆ Editing Large Language Models: Problems, Methods, and Opportunities EMNLP 2023 + + +
+ Despite the ability to train capable LLMs, the methodology for maintaining +their relevancy and rectifying errors remains elusive. To this end, the past +few years have witnessed a surge in techniques for editing LLMs, the objective +of which is to efficiently alter the behavior of LLMs within a specific domain +without negatively impacting performance across other inputs. This paper +embarks on a deep exploration of the problems, methods, and opportunities +related to model editing for LLMs. In particular, we provide an exhaustive +overview of the task definition and challenges associated with model editing, +along with an in-depth empirical analysis of the most progressive methods +currently at our disposal. We also build a new benchmark dataset to facilitate +a more robust evaluation and pinpoint enduring issues intrinsic to existing +techniques. Our objective is to provide valuable insights into the +effectiveness and feasibility of each editing technique, thereby assisting the +community in making informed decisions on the selection of the most appropriate +method for a specific task or context. Code and datasets are available at +https://github.com/zjunlp/EasyEdit. + +
+
+ comment: EMNLP 2023. Updated with new experiments +
+
+
+
+
+ + ♻ ☆ Cycle Invariant Positional Encoding for Graph Representation Learning + + +
+ Cycles are fundamental elements in graph-structured data and have +demonstrated their effectiveness in enhancing graph learning models. To encode +such information into a graph learning framework, prior works often extract a +summary quantity, ranging from the number of cycles to the more sophisticated +persistence diagram summaries. However, more detailed information, such as +which edges are encoded in a cycle, has not yet been used in graph neural +networks. In this paper, we make one step towards addressing this gap, and +propose a structure encoding module, called CycleNet, that encodes cycle +information via edge structure encoding in a permutation invariant manner. To +efficiently encode the space of all cycles, we start with a cycle basis (i.e., +a minimal set of cycles generating the cycle space) which we compute via the +kernel of the 1-dimensional Hodge Laplacian of the input graph. To guarantee +the encoding is invariant w.r.t. the choice of cycle basis, we encode the cycle +information via the orthogonal projector of the cycle basis, which is inspired +by BasisNet proposed by Lim et al. We also develop a more efficient variant +which however requires that the input graph has a unique shortest cycle basis. +To demonstrate the effectiveness of the proposed module, we provide some +theoretical understandings of its expressive power. Moreover, we show via a +range of experiments that networks enhanced by our CycleNet module perform +better in various benchmarks compared to several existing SOTA models. + +
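+ For a plain graph with no filled 2-cells, the cycle space is simply the null
+space of the oriented node-edge incidence matrix, and a basis-invariant encoding
+in the spirit of the orthogonal projector mentioned above can be sketched as
+follows (NumPy/SciPy on a toy graph; this is an illustrative simplification, not
+the paper's implementation):
+
+```python
+import numpy as np
+from scipy.linalg import null_space
+
+edges = [(0, 1), (1, 2), (2, 0), (2, 3)]  # a triangle plus one pendant edge
+n_nodes = 4
+B = np.zeros((n_nodes, len(edges)))       # oriented node-edge incidence matrix
+for e, (u, v) in enumerate(edges):
+    B[u, e], B[v, e] = -1.0, 1.0
+
+C = null_space(B)   # orthonormal basis of the cycle space (num_edges x k)
+P = C @ C.T         # orthogonal projector: independent of the basis choice
+print(C.shape[1])   # 1 independent cycle (the triangle)
+```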
+
+ comment: Accepted as oral presentation in the Learning on Graphs Conference + (LoG 2023) +
+
+
+
+
+ + ♻ ☆ Stable Linear Subspace Identification: A Machine Learning Approach + + +
+ Machine Learning (ML) and linear System Identification (SI) have been +historically developed independently. In this paper, we leverage +well-established ML tools - especially the automatic differentiation framework +- to introduce SIMBa, a family of discrete linear multi-step-ahead state-space +SI methods using backpropagation. SIMBa relies on a novel +Linear-Matrix-Inequality-based free parametrization of Schur matrices to ensure +the stability of the identified model. + We show how SIMBa generally outperforms traditional linear state-space SI +methods, and sometimes significantly, although at the price of a higher +computational burden. This performance gap is particularly remarkable compared +to other SI methods with stability guarantees, where the gain is frequently +above 25% in our investigations, hinting at SIMBa's ability to simultaneously +achieve state-of-the-art fitting performance and enforce stability. +Interestingly, these observations hold for a wide variety of input-output +systems and on both simulated and real-world data, showcasing the flexibility +of the proposed approach. We postulate that this new SI paradigm presents a +great extension potential to identify structured nonlinear models from data, +and we hence open-source SIMBa on https://github.com/Cemempamoi/simba. + +
+
+ comment: Submitted to ECC 2024 +
+
+
+
+
+ + ♻ ☆ Deep Double Descent for Time Series Forecasting: Avoiding Undertrained + Models + + +
+ Deep learning models, particularly Transformers, have achieved impressive +results in various domains, including time series forecasting. While existing +time series literature primarily focuses on model architecture modifications +and data augmentation techniques, this paper explores the training schema of +deep learning models for time series; how models are trained regardless of +their architecture. We perform extensive experiments to investigate the +occurrence of deep double descent in several Transformer models trained on +public time series data sets. We demonstrate epoch-wise deep double descent and +that overfitting can be reverted using more epochs. Leveraging these findings, +we achieve state-of-the-art results for long sequence time series forecasting +in nearly 70% of the 72 benchmarks tested. This suggests that many models in +the literature may possess untapped potential. Additionally, we introduce a +taxonomy for classifying training schema modifications, covering data +augmentation, model inputs, model targets, time series per model, and +computational budget. + +
+
+
+
+
+ + ♻ ☆ Handwriting recognition and automatic scoring for descriptive answers in + Japanese language tests + + +
+ This paper presents an experiment of automatically scoring handwritten +descriptive answers in the trial tests for the new Japanese university entrance +examination, which were made for about 120,000 examinees in 2017 and 2018. +There are about 400,000 answers with more than 20 million characters. Although +all answers have been scored by human examiners, handwritten characters are not +labeled. We present our attempt to adapt deep neural network-based handwriting +recognizers trained on a labeled handwriting dataset into this unlabeled answer +set. Our proposed method combines different training strategies, ensembles +multiple recognizers, and uses a language model built from a large general +corpus to avoid overfitting into specific data. In our experiment, the proposed +method records character accuracy of over 97% using about 2,000 verified +labeled answers that account for less than 0.5% of the dataset. Then, the +recognized answers are fed into a pre-trained automatic scoring system based on +the BERT model without correcting misrecognized characters and providing rubric +annotations. The automatic scoring system achieves from 0.84 to 0.98 of +Quadratic Weighted Kappa (QWK). As QWK is over 0.8, it represents an acceptable +similarity of scoring between the automatic scoring system and the human +examiners. These results are promising for further research on end-to-end +automatic scoring of descriptive answers. + +
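+ Quadratic Weighted Kappa itself can be computed directly with scikit-learn; a
+minimal example (the score arrays below are made-up placeholders, not data from
+the study):
+
+```python
+from sklearn.metrics import cohen_kappa_score
+
+human_scores = [0, 2, 3, 1, 3, 2, 0, 1]   # examiner-assigned scores (illustrative)
+auto_scores  = [0, 2, 3, 2, 3, 2, 0, 0]   # automatic scoring system outputs
+
+qwk = cohen_kappa_score(human_scores, auto_scores, weights="quadratic")
+print(f"QWK = {qwk:.2f}")  # values above 0.8 are commonly read as strong agreement
+```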
+
+ comment: Keywords: handwritten Japanese answers, handwriting recognition, + automatic scoring, ensemble recognition, deep neural networks; Reported in + IEICE technical report, PRMU2021-32, pp.45-50 (2021.12) Published after peer + review and Presented in ICFHR2022, Lecture Notes in Computer Science, vol. + 13639, pp. 274-284 (2022.11) +
+
+
+
+
+ + ♻ ☆ Gated Ensemble of Spatio-temporal Mixture of Experts for Multi-task + Learning in Ride-hailing System + + +
+ Designing spatio-temporal forecasting models separately in a task-wise and +city-wise manner poses a burden for the expanding transportation network +companies. Therefore, a multi-task learning architecture is proposed in this +study by developing gated ensemble of spatio-temporal mixture of experts +network (GESME-Net) with convolutional recurrent neural network (CRNN), +convolutional neural network (CNN), and recurrent neural network (RNN) for +simultaneously forecasting spatio-temporal tasks in a city as well as across +different cities. Furthermore, a task adaptation layer is integrated with the +architecture for learning joint representation in multi-task learning and +revealing the contribution of the input features utilized in prediction. The +proposed architecture is tested with data from Didi Chuxing for: (i) +simultaneously forecasting demand and supply-demand gap in Beijing, and (ii) +simultaneously forecasting demand across Chengdu and Xian. In both scenarios, +models from our proposed architecture outperformed the single-task and +multi-task deep learning benchmarks and ensemble-based machine learning +algorithms. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2012.08868 +
+
+
+
+
+ + ♻ ☆ Beyond Prediction: On-street Parking Recommendation using Heterogeneous + Graph-based List-wise Ranking + + +
+ To provide real-time parking information, existing studies focus on
+predicting parking availability, which seems an indirect approach to saving
+drivers' cruising time. In this paper, we are the first to propose an on-street
+parking recommendation (OPR) task to directly recommend a parking space for a
+driver. To this end, a learn-to-rank (LTR) based OPR model called OPR-LTR is
+built. Specifically, parking recommendation is closely related to the "turnover
+events" (state switching between occupied and vacant) of each parking space,
+and hence we design a highly efficient heterogeneous graph called ESGraph to
+represent historical and real-time meters' turnover events as well as
+geographical relations; afterward, a convolution-based event-then-graph network
+is used to aggregate and update representations of the heterogeneous graph. A
+ranking model is further utilized to learn a score function that helps
+recommend a list of ranked parking spots for a specific on-street parking
+query. The method is verified using the on-street parking meter data in Hong
+Kong and San Francisco. By comparing with the other two types of methods:
+prediction-only and prediction-then-recommendation, the proposed
+direct-recommendation method achieves satisfactory performance in different
+metrics. Extensive experiments also demonstrate that the proposed ESGraph and
+the recommendation model are more efficient in terms of computational
+efficiency as well as saving drivers' on-street parking time.
+
+
+
+
+
+ + ♻ ☆ A PSO Based Method to Generate Actionable Counterfactuals for High + Dimensional Data + + +
+ Counterfactual explanations (CFE) are methods that explain a machine learning
+model by giving an alternate class prediction of a data point with some minimal
+changes in its features. It helps users identify the data attributes that
+caused an undesirable prediction, such as a loan or credit card rejection. We
+describe an efficient and actionable counterfactual (CF) generation method
+based on particle swarm optimization (PSO). We propose a simple objective
+function for the optimization of the instance-centric CF generation problem.
+The PSO brings in a lot of flexibility in terms of carrying out multi-objective
+optimization in large dimensions, capability for multiple CF generation, and
+setting box constraints or immutability of data attributes. An algorithm is
+proposed that incorporates these features and enables greater control over
+the proximity and sparsity properties of the generated CFs. The proposed
+algorithm is evaluated with a set of actionability metrics on real-world
+datasets, and the results were superior to those of the state of the art.
+
+
+ comment: Accepted in IEEE CSDE 2023 +
+
+
+
+
+ + ♻ ☆ Fair Community Detection and Structure Learning in Heterogeneous + Graphical Models + + +
+ Inference of community structure in probabilistic graphical models may not be +consistent with fairness constraints when nodes have demographic attributes. +Certain demographics may be over-represented in some detected communities and +under-represented in others. This paper defines a novel $\ell_1$-regularized +pseudo-likelihood approach for fair graphical model selection. In particular, +we assume there is some community or clustering structure in the true +underlying graph, and we seek to learn a sparse undirected graph and its +communities from the data such that demographic groups are fairly represented +within the communities. In the case when the graph is known a priori, we +provide a convex semidefinite programming approach for fair community +detection. We establish the statistical consistency of the proposed method for +both a Gaussian graphical model and an Ising model for, respectively, +continuous and binary data, proving that our method can recover the graphs and +their fair communities with high probability. + +
+
+
+
+
+ + ♻ ☆ On the Robustness of Decision-Focused Learning AAAI + + +
+ Decision-Focused Learning (DFL) is an emerging learning paradigm that tackles
+the task of training a machine learning (ML) model to predict the missing
+parameters of an incomplete optimization problem. DFL trains an ML model in an
+end-to-end system, by integrating the prediction and optimization tasks,
+providing better alignment of the training and testing objectives. DFL has
+shown a lot of promise and holds the capacity to revolutionize decision-making
+in many real-world applications. However, very little is known about the
+performance of these models under adversarial attacks. We adopt ten unique DFL
+methods and benchmark their performance under two distinctly focused attacks
+adapted towards the Predict-then-Optimize problem setting. Our study proposes
+the hypothesis that the robustness of a model is highly correlated with its
+ability to find predictions that lead to optimal decisions without deviating
+from the ground-truth label. Furthermore, we provide insight into how to target
+the models that violate this condition and show how these models respond
+differently depending on the achieved optimality at the end of their training
+cycles.
+
+
+ comment: 17 pages, 45 figures, submitted to AAAI artificial intelligence for + operations research workshop +
+
+
+
+
+ + ♻ ☆ GNNFlow: A Distributed Framework for Continuous Temporal GNN Learning on + Dynamic Graphs + + +
+ Graph Neural Networks (GNNs) play a crucial role in various fields. However, +most existing deep graph learning frameworks assume pre-stored static graphs +and do not support training on graph streams. In contrast, many real-world +graphs are dynamic and contain time domain information. We introduce GNNFlow, a +distributed framework that enables efficient continuous temporal graph +representation learning on dynamic graphs on multi-GPU machines. GNNFlow +introduces an adaptive time-indexed block-based data structure that effectively +balances memory usage with graph update and sampling operation efficiency. It +features a hybrid GPU-CPU graph data placement for rapid GPU-based temporal +neighborhood sampling and kernel optimizations for enhanced sampling processes. +A dynamic GPU cache for node and edge features is developed to maximize cache +hit rates through reuse and restoration strategies. GNNFlow supports +distributed training across multiple machines with static scheduling to ensure +load balance. We implement GNNFlow based on DGL and PyTorch. Our experimental +results show that GNNFlow provides up to 21.1x faster continuous learning than +existing systems. + +
+
+
+
+
+ + ♻ ☆ Fantastic Weights and How to Find Them: Where to Prune in Dynamic Sparse + Training NeurIPS 2023 + + +
+ Dynamic Sparse Training (DST) is a rapidly evolving area of research that +seeks to optimize the sparse initialization of a neural network by adapting its +topology during training. It has been shown that under specific conditions, DST +is able to outperform dense models. The key components of this framework are +the pruning and growing criteria, which are repeatedly applied during the +training process to adjust the network's sparse connectivity. While the growing +criterion's impact on DST performance is relatively well studied, the influence +of the pruning criterion remains overlooked. To address this issue, we design +and perform an extensive empirical analysis of various pruning criteria to +better understand their impact on the dynamics of DST solutions. Surprisingly, +we find that most of the studied methods yield similar results. The differences +become more significant in the low-density regime, where the best performance +is predominantly given by the simplest technique: magnitude-based pruning. The +code is provided at https://github.com/alooow/fantastic_weights_paper + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Dodging DeepFake Detection via Implicit Spatial-Domain Notch Filtering + + +
+ The current high-fidelity generation and high-precision detection of DeepFake
+images are in an arms race. We believe that producing DeepFakes that are highly
+realistic and 'detection evasive' can serve the ultimate goal of improving
+future generation DeepFake detection capabilities. In this paper, we propose a
+simple yet powerful pipeline to reduce the artifact patterns of fake images
+without hurting image quality by performing implicit spatial-domain notch
+filtering. We first demonstrate that frequency-domain notch filtering, although
+famously shown to be effective in removing periodic noise in the spatial
+domain, is infeasible for our task at hand due to the manual designs required
+for the notch filters. We, therefore, resort to a learning-based approach to
+reproduce the notch filtering effects, but solely in the spatial domain. We
+adopt a combination of adding overwhelming spatial noise for breaking the
+periodic noise pattern and deep image filtering to reconstruct the noise-free
+fake images, and we name our method DeepNotch. Deep image filtering provides a
+specialized filter for each pixel in the noisy image, producing filtered images
+with high fidelity compared to their DeepFake counterparts. Moreover, we also
+use the semantic information of the image to generate an adversarial guidance
+map to add noise intelligently. Our large-scale evaluation on 3 representative
+state-of-the-art DeepFake detection methods (tested on 16 types of DeepFakes)
+has demonstrated that our technique significantly reduces the accuracy of these
+3 detection methods, by 36.79% on average and by up to 97.02% in the best
+case.
+
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ Applying Bayesian Ridge Regression AI Modeling in Virus Severity + Prediction + + +
+ Artificial intelligence (AI) is a powerful tool for reshaping healthcare
+systems. In healthcare, AI is invaluable for its capacity to manage vast
+amounts of data, which can lead to more accurate and speedy diagnoses,
+ultimately easing the workload on healthcare professionals. As a result, AI has
+proven itself to be a powerful tool across various industries, simplifying
+complex tasks and pattern recognition that would otherwise be overwhelming for
+humans or traditional computer algorithms. In this paper, we review the
+strengths and weaknesses of Bayesian Ridge Regression, an AI model that can be
+used to bring cutting-edge virus analysis to healthcare professionals around
+the world. The model's accuracy assessment revealed promising results, with
+room for improvement primarily related to data organization. In addition, the
+severity index serves as a valuable tool to gain a broad overview of patient
+care needs, aligning with healthcare professionals' preference for broader
+categorizations.
+
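+ For reference, a Bayesian ridge regressor of the kind discussed here is
+available off the shelf in scikit-learn; a minimal sketch on synthetic data (the
+features below are illustrative, not the clinical variables used in the paper):
+
+```python
+import numpy as np
+from sklearn.linear_model import BayesianRidge
+from sklearn.model_selection import train_test_split
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(300, 5))                      # stand-in patient features
+y = X @ np.array([2.0, -1.0, 0.5, 0.0, 3.0]) + rng.normal(scale=0.5, size=300)
+
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+model = BayesianRidge().fit(X_train, y_train)
+
+mean_pred, std_pred = model.predict(X_test, return_std=True)  # predictive uncertainty
+print(round(model.score(X_test, y_test), 3))                  # held-out R^2
+```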
+
+ comment: 7 pages, 2 figures, 5 listings +
+
+
+
+
+ + ♻ ☆ ADA-GP: Accelerating DNN Training By Adaptive Gradient Prediction + + +
+ Neural network training is inherently sequential where the layers finish the +forward propagation in succession, followed by the calculation and +back-propagation of gradients (based on a loss function) starting from the last +layer. The sequential computations significantly slow down neural network +training, especially the deeper ones. Prediction has been successfully used in +many areas of computer architecture to speed up sequential processing. +Therefore, we propose ADA-GP, which uses gradient prediction adaptively to +speed up deep neural network (DNN) training while maintaining accuracy. ADA-GP +works by incorporating a small neural network to predict gradients for +different layers of a DNN model. ADA-GP uses a novel tensor reorganization +method to make it feasible to predict a large number of gradients. ADA-GP +alternates between DNN training using backpropagated gradients and DNN training +using predicted gradients. ADA-GP adaptively adjusts when and for how long +gradient prediction is used to strike a balance between accuracy and +performance. Last but not least, we provide a detailed hardware extension in a +typical DNN accelerator to realize the speed up potential from gradient +prediction. Our extensive experiments with fifteen DNN models show that ADA-GP +can achieve an average speed up of 1.47X with similar or even higher accuracy +than the baseline models. Moreover, it consumes, on average, 34% less energy +due to reduced off-chip memory accesses compared to the baseline accelerator. + +
+
+ comment: 13 pages, 21 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Finding emergence in data by maximizing effective information + + +
+ Quantifying emergence and modeling emergent dynamics in a data-driven manner +for complex dynamical systems is challenging due to the lack of direct +observations at the micro-level. Thus, it's crucial to develop a framework to +identify emergent phenomena and capture emergent dynamics at the macro-level +using available data. Inspired by the theory of causal emergence (CE), this +paper introduces a machine learning framework to learn macro-dynamics in an +emergent latent space and quantify the degree of CE. The framework maximizes +effective information, resulting in a macro-dynamics model with enhanced causal +effects. Experimental results on simulated and real data demonstrate the +effectiveness of the proposed framework. It quantifies degrees of CE +effectively under various conditions and reveals distinct influences of +different noise types. It can learn a one-dimensional coarse-grained +macro-state from fMRI data, to represent complex neural activities during movie +clip viewing. Furthermore, improved generalization to different test +environments is observed across all simulation data. + +
+
+
+
+
+ + ♻ ☆ SELF: Language-Driven Self-Evolution for Large Language Models + + +
+ Large Language Models (LLMs) have demonstrated remarkable versatility across +various domains. To further advance LLMs, we propose 'SELF' (Self-Evolution +with Language Feedback), a novel approach that enables LLMs to self-improve +through self-reflection, akin to human learning processes. SELF initiates with +a meta-skill learning process that equips the LLMs with capabilities for +self-feedback and self-refinement. Subsequently, the model undergoes an +iterative process of self-evolution. In each iteration, it utilizes an +unlabeled dataset of instructions to generate initial responses. These +responses are enhanced through self-feedback and self-refinement. The model is +then fine-tuned using this enhanced data. The model undergoes progressive +improvement through this iterative self-evolution process. Moreover, the SELF +framework enables the model to apply self-refinement during inference, which +further improves response quality. Our experiments in mathematics and general +tasks demonstrate that SELF can enhance the capabilities of LLMs without human +intervention. The SELF framework indicates a promising direction for the +autonomous evolution of LLMs, transitioning them from passive information +receivers to active participants in their development. + +
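The iterative self-evolution loop described above can be written schematically; `generate`, `self_feedback`, `self_refine`, and `finetune` are hypothetical placeholders for model calls and a fine-tuning routine, not the SELF codebase.

```python
# Schematic sketch of an iterative self-evolution loop.
# generate, self_feedback, self_refine and finetune are hypothetical stand-ins
# for LLM calls and a fine-tuning step.
def self_evolve(model, unlabeled_instructions, generate, self_feedback,
                self_refine, finetune, rounds=3):
    for _ in range(rounds):
        training_pairs = []
        for instruction in unlabeled_instructions:
            response = generate(model, instruction)            # initial answer
            critique = self_feedback(model, instruction, response)
            improved = self_refine(model, instruction, response, critique)
            training_pairs.append((instruction, improved))     # refined data
        model = finetune(model, training_pairs)                # next iteration's model
    return model
```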
+
+ comment: 17 pages, 4 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ Improving the Robustness of Transformer-based Large Language Models with + Dynamic Attention + + +
+ Transformer-based models, such as BERT and GPT, have been widely adopted in
+natural language processing (NLP) due to their exceptional performance.
+However, recent studies show their vulnerability to textual adversarial attacks
+where the model's output can be misled by intentionally manipulating the text
+inputs. Despite various methods that have been proposed to enhance the model's
+robustness and mitigate this vulnerability, many require heavy resource
+consumption (e.g., adversarial training) or only provide limited protection
+(e.g., defensive dropout). In this paper, we propose a novel method called
+dynamic attention, tailored for the transformer architecture, to enhance the
+inherent robustness of the model itself against various adversarial attacks.
+Our method requires no downstream task knowledge and does not incur additional
+costs. The proposed dynamic attention consists of two modules: (i) attention
+rectification, which masks or weakens the attention value of the chosen tokens,
+and (ii) dynamic modeling, which dynamically builds the set of candidate
+tokens. Extensive experiments demonstrate that dynamic attention significantly
+mitigates the impact of adversarial attacks, achieving up to 33\% better
+performance than previous methods against widely-used adversarial attacks. The
+model-level design of dynamic attention enables it to be easily combined with
+other defense methods (e.g., adversarial training) to further enhance the
+model's robustness. Furthermore, we demonstrate that dynamic attention
+preserves the state-of-the-art robustness space of the original model compared
+to other dynamic modeling methods.
+

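A crude sketch of the attention-rectification idea (masking or weakening the attention weights of selected token positions) is given below; the heuristic used to choose tokens is a placeholder and not the paper's dynamic modeling module.

```python
# Sketch: zero out (rectify) attention weights for a chosen set of token positions.
# Selecting tokens by the highest attention mass is an illustrative heuristic only.
import torch

def rectify_attention(attn, k=2):
    """attn: (batch, heads, seq, seq) softmax attention weights."""
    mass = attn.sum(dim=(1, 2))                  # (batch, seq) attention received per token
    drop = mass.topk(k, dim=-1).indices          # positions to mask/weaken
    mask = torch.ones_like(attn)
    for b in range(attn.size(0)):
        mask[b, :, :, drop[b]] = 0.0             # mask the columns of the chosen tokens
    attn = attn * mask
    return attn / attn.sum(dim=-1, keepdim=True).clamp_min(1e-9)   # renormalize rows

attn = torch.softmax(torch.randn(2, 4, 8, 8), dim=-1)
print(rectify_attention(attn).shape)             # torch.Size([2, 4, 8, 8])
```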
+
+
+
+
+ + ♻ ☆ GlycoNMR: Dataset and benchmarks for NMR chemical shift prediction of + carbohydrates with graph neural networks + + +
+ Molecular representation learning (MRL) is a powerful tool for bridging the
+gap between machine learning and chemical sciences, as it converts molecules
+into numerical representations while preserving their chemical features. These
+encoded representations serve as a foundation for various downstream
+biochemical studies, including property prediction and drug design. MRL has had
+great success with proteins and general biomolecule datasets. Yet, in the
+growing sub-field of glycoscience (the study of carbohydrates, where longer
+carbohydrates are also called glycans), MRL methods have been barely explored.
+This under-exploration can be primarily attributed to the limited availability
+of comprehensive and well-curated carbohydrate-specific datasets and a lack of
+machine learning (ML) pipelines specifically tailored to meet the unique
+problems presented by carbohydrate data. Since interpreting and annotating
+carbohydrate-specific data is generally more complicated than for protein data,
+domain experts usually need to be involved. The existing MRL methods,
+predominantly optimized for proteins and small biomolecules, also cannot be
+directly used in carbohydrate applications without special modifications. To
+address this challenge, accelerate progress in glycoscience, and enrich the
+data resources of the MRL community, we introduce GlycoNMR. GlycoNMR contains
+two laboriously curated datasets with 2,609 carbohydrate structures and 211,543
+annotated nuclear magnetic resonance (NMR) chemical shifts for precise
+atomic-level prediction. We tailored carbohydrate-specific features and adapted
+existing MRL models to tackle this problem effectively. For illustration, we
+benchmark four modified MRL models on our new datasets.
+

+
+
+
+
+ + ♻ ☆ Effective Backdoor Mitigation Depends on the Pre-training Objective NeurIPS 2023 + + +
+ Despite the advanced capabilities of contemporary machine learning (ML)
+models, they remain vulnerable to adversarial and backdoor attacks. This
+vulnerability is particularly concerning in real-world deployments, where
+compromised models may exhibit unpredictable behavior in critical scenarios.
+Such risks are heightened by the prevalent practice of collecting massive,
+internet-sourced datasets for pre-training multimodal models, as these datasets
+may harbor backdoors. Various techniques have been proposed to mitigate the
+effects of backdooring in these models, such as CleanCLIP, which is the current
+state-of-the-art approach. In this work, we demonstrate that the efficacy of
+CleanCLIP in mitigating backdoors is highly dependent on the particular
+objective used during model pre-training. We observe that stronger pre-training
+objectives correlate with harder-to-remove backdoor behaviors. We show this by
+training multimodal models on two large datasets consisting of 3 million (CC3M)
+and 6 million (CC6M) datapoints, under various pre-training objectives,
+followed by poison removal using CleanCLIP. We find that CleanCLIP is
+ineffective when stronger pre-training objectives are used, even with extensive
+hyperparameter tuning. Our findings underscore critical considerations for ML
+practitioners who pre-train models using large-scale web-curated data and are
+concerned about potential backdoor threats. Notably, our results suggest that
+simpler pre-training objectives are more amenable to effective backdoor
+removal. This insight is pivotal for practitioners seeking to balance the
+trade-offs between using stronger pre-training objectives and security against
+backdoor attacks.
+

+
+ comment: Accepted for oral presentation at BUGS workshop @ NeurIPS 2023 + (https://neurips2023-bugs.github.io/) +
+
+
+
+
+
+
+
+ + Multimedia 12 + +
+
+
+ + ☆ VIDiff: Translating Videos via Multi-Modal Instructions with Diffusion + Models + + +
+ Diffusion models have achieved significant success in image and video +generation. This motivates a growing interest in video editing tasks, where +videos are edited according to provided text descriptions. However, most +existing approaches only focus on video editing for short clips and rely on +time-consuming tuning or inference. We are the first to propose Video +Instruction Diffusion (VIDiff), a unified foundation model designed for a wide +range of video tasks. These tasks encompass both understanding tasks (such as +language-guided video object segmentation) and generative tasks (video editing +and enhancement). Our model can edit and translate the desired results within +seconds based on user instructions. Moreover, we design an iterative +auto-regressive method to ensure consistency in editing and enhancing long +videos. We provide convincing generative results for diverse input videos and +written instructions, both qualitatively and quantitatively. More examples can +be found at our website https://ChenHsing.github.io/VIDiff. + +
+
+
+
+
+ + ☆ Motion-Conditioned Image Animation for Video Editing + + +
+ We introduce MoCA, a Motion-Conditioned Image Animation approach for video +editing. It leverages a simple decomposition of the video editing problem into +image editing followed by motion-conditioned image animation. Furthermore, +given the lack of robust evaluation datasets for video editing, we introduce a +new benchmark that measures edit capability across a wide variety of tasks, +such as object replacement, background changes, style changes, and motion +edits. We present a comprehensive human evaluation of the latest video editing +methods along with MoCA, on our proposed benchmark. MoCA establishes a new +state-of-the-art, demonstrating greater human preference win-rate, and +outperforming notable recent approaches including Dreamix (63%), MasaCtrl +(75%), and Tune-A-Video (72%), with especially significant improvements for +motion edits. + +
+
+ comment: Project page: https://facebookresearch.github.io/MoCA +
+
+
+
+
+ + ☆ Automated interpretation of congenital heart disease from multi-view + echocardiograms + + +
+ Congenital heart disease (CHD) is the most common birth defect and the
+leading cause of neonate death in China. Clinical diagnosis can be based on the
+selected 2D key-frames from five views. Limited by the availability of
+multi-view data, most methods have to rely on insufficient single-view
+analysis. This study proposes to automatically analyze the multi-view
+echocardiograms with a practical end-to-end framework. We collect the five-view
+echocardiogram video records of 1308 subjects (including normal controls,
+ventricular septal defect (VSD) patients and atrial septal defect (ASD)
+patients) with both disease labels and standard-view key-frame labels.
+Depthwise separable convolution-based multi-channel networks are adopted to
+largely reduce the network parameters. We also approach the imbalanced class
+problem by augmenting the positive training samples. Our 2D key-frame model can
+diagnose CHD versus negative samples with an accuracy of 95.4\%, and perform negative,
+VSD or ASD classification with an accuracy of 92.3\%. To further alleviate the
+work of key-frame selection in real-world implementation, we propose an
+adaptive soft attention scheme to directly explore the raw video data. Four
+kinds of neural aggregation methods are systematically investigated to fuse the
+information of an arbitrary number of frames in a video. Moreover, with a view
+detection module, the system can work without the view records. Our video-based
+model can diagnose with an accuracy of 93.9\% (binary classification), and
+92.1\% (3-class classification) in a collected 2D video testing set, which does
+not need key-frame selection and view annotation in testing. The detailed
+ablation study and the interpretability analysis are provided.
+

+
+ comment: Published in Medical Image Analysis +
+
+
+
+
+ + ☆ Multi-task learning with cross-task consistency for improved depth + estimation in colonoscopy + + +
+ Colonoscopy screening is the gold standard procedure for assessing
+abnormalities in the colon and rectum, such as ulcers and cancerous polyps.
+Measuring the abnormal mucosal area and its 3D reconstruction can help quantify
+the surveyed area and objectively evaluate disease burden. However, due to the
+complex topology of these organs and variable physical conditions (for example,
+lighting, large homogeneous texture, and image modality), estimating distance
+from the camera (aka depth) is highly challenging. Moreover, most colonoscopic
+video acquisition is monocular, making the depth estimation a non-trivial
+problem. While methods in computer vision for depth estimation have been
+proposed and advanced on natural scene datasets, the efficacy of these
+techniques has not been widely quantified on colonoscopy datasets. As the
+colonic mucosa has several low-texture regions that are not well pronounced,
+learning representations from an auxiliary task can improve salient feature
+extraction, allowing estimation of accurate camera depths. In this work, we
+propose to develop a novel multi-task learning (MTL) approach with a shared
+encoder and two decoders, namely a surface normal decoder and a depth estimator
+decoder. Our depth estimator incorporates attention mechanisms to enhance
+global context awareness. We leverage the surface normal prediction to improve
+geometric feature extraction. Also, we apply a cross-task consistency loss
+between the two geometrically related tasks, surface normal and camera depth. We
+demonstrate an improvement of 14.17% on relative error and 10.4% improvement on
+$\delta_{1}$ accuracy over the most accurate baseline state-of-the-art BTS
+approach. All experiments are conducted on a recently released C3VD dataset;
+thus, we provide a first benchmark of state-of-the-art methods.
+

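The cross-task consistency idea can be illustrated as an extra loss term that ties normals derived from the predicted depth to the predicted normals; the finite-difference normal computation and loss weights below are simplifying assumptions, not the paper's exact formulation.

```python
# Sketch of a multi-task loss: depth + surface normals + cross-task consistency.
# Normals are derived from depth via finite differences; weights are placeholders.
import torch
import torch.nn.functional as F

def normals_from_depth(depth):
    """depth: (B, 1, H, W) -> unit normals (B, 3, H, W) via finite differences."""
    dzdx = depth[..., :, 1:] - depth[..., :, :-1]
    dzdy = depth[..., 1:, :] - depth[..., :-1, :]
    dzdx = F.pad(dzdx, (0, 1, 0, 0))                  # restore width
    dzdy = F.pad(dzdy, (0, 0, 0, 1))                  # restore height
    n = torch.cat([-dzdx, -dzdy, torch.ones_like(depth)], dim=1)
    return F.normalize(n, dim=1)

def mtl_loss(pred_depth, pred_normals, gt_depth, gt_normals,
             w_depth=1.0, w_norm=1.0, w_consist=0.5):
    depth_loss = F.l1_loss(pred_depth, gt_depth)
    normal_loss = 1 - F.cosine_similarity(pred_normals, gt_normals, dim=1).mean()
    consistency = 1 - F.cosine_similarity(
        normals_from_depth(pred_depth), pred_normals, dim=1).mean()
    return w_depth * depth_loss + w_norm * normal_loss + w_consist * consistency
```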
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ HKUST at SemEval-2023 Task 1: Visual Word Sense Disambiguation with + Context Augmentation and Visual Assistance + + +
+ Visual Word Sense Disambiguation (VWSD) is a multi-modal task that aims to +select, among a batch of candidate images, the one that best entails the target +word's meaning within a limited context. In this paper, we propose a +multi-modal retrieval framework that maximally leverages pretrained +Vision-Language models, as well as open knowledge bases and datasets. Our +system consists of the following key components: (1) Gloss matching: a +pretrained bi-encoder model is used to match contexts with proper senses of the +target words; (2) Prompting: matched glosses and other textual information, +such as synonyms, are incorporated using a prompting template; (3) Image +retrieval: semantically matching images are retrieved from large open datasets +using prompts as queries; (4) Modality fusion: contextual information from +different modalities are fused and used for prediction. Although our system +does not produce the most competitive results at SemEval-2023 Task 1, we are +still able to beat nearly half of the teams. More importantly, our experiments +reveal acute insights for the field of Word Sense Disambiguation (WSD) and +multi-modal learning. Our code is available on GitHub. + +
+
+
+
+
+ + ☆ mPLUG-PaperOwl: Scientific Diagram Analysis with the Multimodal Large + Language Model + + +
+ Recently, the strong text creation ability of Large Language Models (LLMs) has
+given rise to many tools for assisting paper reading or even writing. However,
+the weak diagram analysis abilities of LLMs or Multimodal LLMs greatly limit
+their application scenarios, especially for scientific academic paper writing.
+In this work, towards a more versatile copilot for academic paper writing, we
+mainly focus on strengthening the multi-modal diagram analysis ability of
+Multimodal LLMs. By parsing LaTeX source files of high-quality papers, we
+carefully build a multi-modal diagram understanding dataset M-Paper. By
+aligning diagrams in the paper with related paragraphs, we construct
+professional diagram analysis samples for training and evaluation. M-Paper is
+the first dataset to support joint comprehension of multiple scientific
+diagrams, including figures and tables in the format of images or LaTeX code.
+Besides, to better align the copilot with the user's intention, we introduce
+the `outline' as the control signal, which could be directly given by the user
+or revised based on auto-generated ones. Comprehensive experiments with a
+state-of-the-art Multimodal LLM demonstrate that training on our dataset yields
+stronger scientific diagram understanding performance, including diagram
+captioning, diagram analysis, and outline recommendation. The dataset, code,
+and model are available at
+https://github.com/X-PLUG/mPLUG-DocOwl/tree/main/PaperOwl.
+

+
+ comment: 20 pages, 12 figures. arXiv admin note: text overlap with + arXiv:2305.15225 by other authors +
+
+
+
+
+ + ☆ DKiS: Decay weight invertible image steganography with private key + + +
+ Image steganography, the practice of concealing information within another +image, traditionally faces security challenges when its methods become publicly +known. To counteract this, we introduce a novel private key-based image +steganography technique. This approach ensures the security of hidden +information, requiring a corresponding private key for access, irrespective of +the public knowledge of the steganography method. We present experimental +evidence demonstrating our method's effectiveness, showcasing its real-world +applicability. Additionally, we identified a critical challenge in the +invertible image steganography process: the transfer of non-essential, or +`garbage', information from the secret to the host pipeline. To address this, +we introduced the decay weight to control the information transfer, filtering +out irrelevant data and enhancing the performance of image steganography. Our +code is publicly accessible at https://github.com/yanghangAI/DKiS, and a +practical demonstration is available at http://yanghang.site/hidekey. + +
+
+
+
+
+ + ☆ FS-BAND: A Frequency-Sensitive Banding Detector + + +
+ Banding artifact, also known as staircase-like contour, is a common quality
+annoyance that occurs in scenarios such as compression and transmission, and
+largely affects the user's quality of experience (QoE). The banding distortion
+typically appears as relatively small pixel-wise variations in smooth
+backgrounds, which are difficult to analyze in the spatial domain but easily
+reflected in the frequency domain. In this paper, we thereby study the banding
+artifact from the frequency aspect and propose a no-reference banding detection
+model to capture and evaluate banding artifacts, called the Frequency-Sensitive
+BANding Detector (FS-BAND). The proposed detector is able to generate a
+pixel-wise banding map with a perception-correlated quality score. Experimental
+results show that the proposed FS-BAND method outperforms state-of-the-art
+image quality assessment (IQA) approaches with higher accuracy in the banding
+classification task.
+

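As a toy illustration of why banding is easier to see in the frequency domain, the snippet below computes the share of low-frequency (non-DC) energy in a smooth patch before and after quantization; the band limits are invented for illustration and are unrelated to FS-BAND's learned detector.

```python
# Toy frequency-domain banding indicator: share of low-frequency (non-DC) energy
# in a patch's 2D spectrum. Band limits are illustrative assumptions only.
import numpy as np

def banding_score(patch, low_band=(1, 12)):
    f = np.fft.fftshift(np.fft.fft2(patch.astype(np.float64)))
    energy = np.abs(f) ** 2
    h, w = patch.shape
    yy, xx = np.ogrid[:h, :w]
    r = np.sqrt((yy - h // 2) ** 2 + (xx - w // 2) ** 2)   # radial frequency
    band = (r >= low_band[0]) & (r < low_band[1])           # low, non-DC frequencies
    return energy[band].sum() / (energy[r >= 1].sum() + 1e-9)

smooth = np.tile(np.linspace(0, 1, 64), (64, 1))             # clean gradient
banded = (smooth * 8).astype(int) / 8.0                      # quantized (staircase) version
print(banding_score(smooth), banding_score(banded))
```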
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2311.17752 +
+
+
+
+
+ + ☆ mPLUG-PaperOwl: Scientific Diagram Analysis with the Multimodal Large + Language Model + + +
+ Recently, the strong text creation ability of Large Language Models (LLMs) has
+given rise to many tools for assisting paper reading or even writing. However,
+the weak diagram analysis abilities of LLMs or Multimodal LLMs greatly limit
+their application scenarios, especially for scientific academic paper writing.
+In this work, towards a more versatile copilot for academic paper writing, we
+mainly focus on strengthening the multi-modal diagram analysis ability of
+Multimodal LLMs. By parsing LaTeX source files of high-quality papers, we
+carefully build a multi-modal diagram understanding dataset M-Paper. By
+aligning diagrams in the paper with related paragraphs, we construct
+professional diagram analysis samples for training and evaluation. M-Paper is
+the first dataset to support joint comprehension of multiple scientific
+diagrams, including figures and tables in the format of images or LaTeX code.
+Besides, to better align the copilot with the user's intention, we introduce
+the `outline' as the control signal, which could be directly given by the user
+or revised based on auto-generated ones. Comprehensive experiments with a
+state-of-the-art Multimodal LLM demonstrate that training on our dataset yields
+stronger scientific diagram understanding performance, including diagram
+captioning, diagram analysis, and outline recommendation. The dataset, code,
+and model are available at
+https://github.com/X-PLUG/mPLUG-DocOwl/tree/main/PaperOwl.
+

+
+ comment: 20 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Differentiable JPEG: The Devil is in the Details WACV 2024 + + +
+ JPEG remains one of the most widespread lossy image coding methods. However, +the non-differentiable nature of JPEG restricts the application in deep +learning pipelines. Several differentiable approximations of JPEG have recently +been proposed to address this issue. This paper conducts a comprehensive review +of existing diff. JPEG approaches and identifies critical details that have +been missed by previous methods. To this end, we propose a novel diff. JPEG +approach, overcoming previous limitations. Our approach is differentiable +w.r.t. the input image, the JPEG quality, the quantization tables, and the +color conversion parameters. We evaluate the forward and backward performance +of our diff. JPEG approach against existing methods. Additionally, extensive +ablations are performed to evaluate crucial design choices. Our proposed diff. +JPEG resembles the (non-diff.) reference implementation best, significantly +surpassing the recent-best diff. approach by $3.47$dB (PSNR) on average. For +strong compression rates, we can even improve PSNR by $9.51$dB. Strong +adversarial attack results are yielded by our diff. JPEG, demonstrating the +effective gradient approximation. Our code is available at +https://github.com/necla-ml/Diff-JPEG. + +
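A recurring detail in differentiable JPEG approximations is replacing the hard rounding inside quantization with a smooth surrogate; the sketch below shows one commonly used cubic surrogate and is a generic illustration, not the paper's implementation.

```python
# Generic sketch of "soft" rounding used in differentiable JPEG-style pipelines.
import torch

def soft_round(x):
    # Cubic surrogate of rounding: stays close to round(x), but its gradient
    # 3*(x - round(x))**2 is nonzero away from integers (hard rounding has
    # zero gradient almost everywhere).
    return torch.round(x) + (x - torch.round(x)) ** 3

def quantize_dequantize(dct_block, qtable):
    # Differentiable stand-in for JPEG's quantize/dequantize step.
    return soft_round(dct_block / qtable) * qtable

dct = (torch.randn(8, 8) * 50).requires_grad_()
q = torch.full((8, 8), 16.0)                     # toy quantization table
quantize_dequantize(dct, q).sum().backward()
print(dct.grad.abs().mean())                     # gradients flow through quantization
```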
+
+ comment: Accepted at WACV 2024. Project page: + https://christophreich1996.github.io/differentiable_jpeg/ +
+
+
+
+
+ + ♻ ☆ Generating More Pertinent Captions by Leveraging Semantics and Style on + Multi-Source Datasets + + +
+ This paper addresses the task of generating fluent descriptions by training +on a non-uniform combination of data sources, containing both human-annotated +and web-collected captions. Large-scale datasets with noisy image-text pairs, +indeed, provide a sub-optimal source of supervision because of their +low-quality descriptive style, while human-annotated datasets are cleaner but +smaller in scale. To get the best of both worlds, we propose to leverage and +separate semantics and descriptive style through the incorporation of a style +token and keywords extracted through a retrieval component. The proposed model +avoids the need of object detectors, is trained with a single objective of +prompt language modeling, and can replicate the style of human-collected +captions while training on sources with different input styles. Experimentally, +the model shows a strong capability of recognizing real-world concepts and +producing high-quality captions. Extensive experiments are performed on +different image captioning datasets, including CC3M, nocaps, and the +competitive COCO dataset, where our model consistently outperforms baselines +and state-of-the-art approaches. + +
+
+ comment: Accepted to IJCV +
+
+
+
+
+ + ♻ ☆ Language Models as Black-Box Optimizers for Vision-Language Models + + +
+ Vision-language models (VLMs) pre-trained on web-scale datasets have +demonstrated remarkable capabilities on downstream tasks when fine-tuned with +minimal data. However, many VLMs rely on proprietary data and are not +open-source, which restricts the use of white-box approaches for fine-tuning. +As such, we aim to develop a black-box approach to optimize VLMs through +natural language prompts, thereby avoiding the need to access model parameters, +feature embeddings, or even output logits. We propose employing chat-based LLMs +to search for the best text prompt for VLMs. Specifically, we adopt an +automatic hill-climbing procedure that converges to an effective prompt by +evaluating the performance of current prompts and asking LLMs to refine them +based on textual feedback, all within a conversational process without +human-in-the-loop. In a challenging 1-shot image classification setup, our +simple approach surpasses the white-box continuous prompting method (CoOp) by +an average of 1.5% across 11 datasets including ImageNet. Our approach also +outperforms both human-engineered and LLM-generated prompts. We highlight the +advantage of conversational feedback that incorporates both positive and +negative prompts, suggesting that LLMs can utilize the implicit gradient +direction in textual feedback for a more efficient search. In addition, we find +that the text prompts generated through our strategy are not only more +interpretable but also transfer well across different VLM architectures in a +black-box manner. Lastly, we demonstrate our framework on a state-of-the-art +black-box VLM (DALL-E 3) for text-to-image optimization. + +
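The conversational hill-climbing loop can be sketched abstractly as follows; `score_prompt` and `ask_llm_for_refinement` are hypothetical stand-ins for a VLM evaluation and a chat-LLM call, not APIs from the paper.

```python
# Abstract sketch of conversational hill-climbing over text prompts.
# score_prompt and ask_llm_for_refinement are hypothetical placeholders for a
# VLM accuracy evaluation and a chat-LLM refinement call, respectively.
from typing import Callable, List, Tuple

def hill_climb_prompt(seed: str,
                      score_prompt: Callable[[str], float],
                      ask_llm_for_refinement: Callable[[List[Tuple[str, float]]], str],
                      iterations: int = 20) -> str:
    history = [(seed, score_prompt(seed))]          # (prompt, score) pairs
    best_prompt, best_score = history[0]
    for _ in range(iterations):
        # Show the LLM both good and bad prompts so it can infer a direction.
        feedback = sorted(history, key=lambda pair: pair[1])
        candidate = ask_llm_for_refinement(feedback)
        score = score_prompt(candidate)
        history.append((candidate, score))
        if score > best_score:
            best_prompt, best_score = candidate, score
    return best_prompt
```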
+
+ comment: Project site: llm-can-optimize-vlm.github.io +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 74 + +
+
+
+ + ☆ Knowledge Pursuit Prompting for Zero-Shot Multimodal Synthesis + + +
+ Hallucinations and unfaithful synthesis due to inaccurate prompts with +insufficient semantic details are widely observed in multimodal generative +models. A prevalent strategy to align multiple modalities is to fine-tune the +generator with a large number of annotated text-image pairs. However, such a +procedure is labor-consuming and resource-draining. The key question we ask is: +can we enhance the quality and faithfulness of text-driven generative models +beyond extensive text-image pair annotations? To address this question, we +propose Knowledge Pursuit Prompting (KPP), a zero-shot framework that +iteratively incorporates external knowledge to help generators produce reliable +visual content. Instead of training generators to handle generic prompts, KPP +employs a recursive knowledge query process to gather informative external +facts from the knowledge base, instructs a language model to compress the +acquired knowledge for prompt refinement, and utilizes text-driven generators +for visual synthesis. The entire process is zero-shot, without accessing the +architectures and parameters of generative models. We evaluate the framework +across multiple text-driven generative tasks (image, 3D rendering, and video) +on datasets of different domains. We further demonstrate the extensibility and +adaptability of KPP through varying foundation model bases and instructions. +Our results show that KPP is capable of generating faithful and semantically +rich content across diverse visual domains, offering a promising solution to +improve multimodal generative models. + +
+
+
+
+
+ + ☆ A Pipeline For Discourse Circuits From CCG + + +
+ There is a significant disconnect between linguistic theory and modern NLP +practice, which relies heavily on inscrutable black-box architectures. +DisCoCirc is a newly proposed model for meaning that aims to bridge this +divide, by providing neuro-symbolic models that incorporate linguistic +structure. DisCoCirc represents natural language text as a `circuit' that +captures the core semantic information of the text. These circuits can then be +interpreted as modular machine learning models. Additionally, DisCoCirc fulfils +another major aim of providing an NLP model that can be implemented on +near-term quantum computers. + In this paper we describe a software pipeline that converts English text to +its DisCoCirc representation. The pipeline achieves coverage over a large +fragment of the English language. It relies on Combinatory Categorial Grammar +(CCG) parses of the input text as well as coreference resolution information. +This semantic and syntactic information is used in several steps to convert the +text into a simply-typed $\lambda$-calculus term, and then into a circuit +diagram. This pipeline will enable the application of the DisCoCirc framework +to NLP tasks, using both classical and quantum approaches. + +
+
+ comment: 39 pages, many figures +
+
+
+
+
+ + ☆ Look Before You Leap: Unveiling the Power of GPT-4V in Robotic + Vision-Language Planning + + +
+ In this study, we are interested in imbuing robots with the capability of +physically-grounded task planning. Recent advancements have shown that large +language models (LLMs) possess extensive knowledge useful in robotic tasks, +especially in reasoning and planning. However, LLMs are constrained by their +lack of world grounding and dependence on external affordance models to +perceive environmental information, which cannot jointly reason with LLMs. We +argue that a task planner should be an inherently grounded, unified multimodal +system. To this end, we introduce Robotic Vision-Language Planning (ViLa), a +novel approach for long-horizon robotic planning that leverages vision-language +models (VLMs) to generate a sequence of actionable steps. ViLa directly +integrates perceptual data into its reasoning and planning process, enabling a +profound understanding of commonsense knowledge in the visual world, including +spatial layouts and object attributes. It also supports flexible multimodal +goal specification and naturally incorporates visual feedback. Our extensive +evaluation, conducted in both real-robot and simulated environments, +demonstrates ViLa's superiority over existing LLM-based planners, highlighting +its effectiveness in a wide array of open-world manipulation tasks. + +
+
+
+
+
+ + ☆ Higher-Order DisCoCat (Peirce-Lambek-Montague semantics) + + +
+ We propose a new definition of higher-order DisCoCat (categorical +compositional distributional) models where the meaning of a word is not a +diagram, but a diagram-valued higher-order function. Our models can be seen as +a variant of Montague semantics based on a lambda calculus where the primitives +act on string diagrams rather than logical formulae. As a special case, we show +how to translate from the Lambek calculus into Peirce's system beta for +first-order logic. This allows us to give a purely diagrammatic treatment of +higher-order and non-linear processes in natural language semantics: adverbs, +prepositions, negation and quantifiers. The theoretical definition presented in +this article comes with a proof-of-concept implementation in DisCoPy, the +Python library for string diagrams. + +
+
+ comment: 19 pages, 11 figures +
+
+
+
+
+ + ☆ DSS: Synthesizing long Digital Ink using Data augmentation, Style + encoding and Split generation + + +
+ As text generative models can give increasingly long answers, we tackle the
+problem of synthesizing long text in digital ink. We show that the commonly
+used models for this task fail to generalize to long-form data and how this
+problem can be solved by augmenting the training data, changing the model
+architecture and the inference procedure. These methods use a contrastive
+learning technique and are tailored specifically for the handwriting domain.
+They can be applied to any encoder-decoder model that works with digital ink.
+We demonstrate that our method reduces the character error rate on long-form
+English data by half compared to a baseline RNN and by 16% compared to the
+previous approach that aims at addressing the same problem. We show that all
+three parts of the method improve the recognizability of generated inks. In
+addition, we evaluate synthesized data in a human study and find that people
+perceive most of the generated data as real.
+

+
+
+
+
+ + ☆ Supervising the Centroid Baseline for Extractive Multi-Document + Summarization + + +
+ The centroid method is a simple approach for extractive multi-document +summarization and many improvements to its pipeline have been proposed. We +further refine it by adding a beam search process to the sentence selection and +also a centroid estimation attention model that leads to improved results. We +demonstrate this in several multi-document summarization datasets, including in +a multilingual scenario. + +
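For context, the plain (unsupervised) centroid baseline that the paper builds on can be sketched in a few lines with TF-IDF vectors; the proposed beam search and centroid-estimation attention model are not included in this sketch.

```python
# Sketch of the plain centroid baseline for extractive multi-document summarization.
# Greedy top-k selection only; beam search and the attention model are omitted.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def centroid_summary(sentences, k=3):
    vec = TfidfVectorizer().fit(sentences)
    X = vec.transform(sentences).toarray()
    centroid = X.mean(axis=0)                              # "topic" of the document cluster
    scores = X @ centroid / (np.linalg.norm(X, axis=1)
                             * np.linalg.norm(centroid) + 1e-9)
    order = np.argsort(-scores)[:k]                        # top-k sentences by similarity
    return [sentences[i] for i in sorted(order)]           # keep original order

docs = ["The storm hit the coast on Monday.",
        "Thousands were evacuated before the storm arrived.",
        "Officials praised the quick evacuation.",
        "Local sports results were also announced."]
print(centroid_summary(docs, k=2))
```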
+
+ comment: Accepted at "The 4th New Frontiers in Summarization (with LLMs) + Workshop" +
+
+
+
+
+ + ☆ Mukhyansh: A Headline Generation Dataset for Indic Languages ACL + + +
+ The task of headline generation within the realm of Natural Language +Processing (NLP) holds immense significance, as it strives to distill the true +essence of textual content into concise and attention-grabbing summaries. While +noteworthy progress has been made in headline generation for widely spoken +languages like English, there persist numerous challenges when it comes to +generating headlines in low-resource languages, such as the rich and diverse +Indian languages. A prominent obstacle that specifically hinders headline +generation in Indian languages is the scarcity of high-quality annotated data. +To address this crucial gap, we proudly present Mukhyansh, an extensive +multilingual dataset, tailored for Indian language headline generation. +Comprising an impressive collection of over 3.39 million article-headline +pairs, Mukhyansh spans across eight prominent Indian languages, namely Telugu, +Tamil, Kannada, Malayalam, Hindi, Bengali, Marathi, and Gujarati. We present a +comprehensive evaluation of several state-of-the-art baseline models. +Additionally, through an empirical analysis of existing works, we demonstrate +that Mukhyansh outperforms all other models, achieving an impressive average +ROUGE-L score of 31.43 across all 8 languages. + +
+
+ comment: Accepted at PACLIC 2023 +
+
+
+
+
+ + ☆ End-to-end Joint Rich and Normalized ASR with a limited amount of rich + training data ICASSP 2024 + + +
+ Joint rich and normalized automatic speech recognition (ASR), which produces
+transcriptions both with and without punctuation and capitalization, remains a
+challenge. End-to-end (E2E) ASR models offer both convenience and the ability
+to perform such joint transcription of speech. Training such models requires
+paired speech and rich text data, which is not widely available. In this paper,
+we compare two different approaches to train a stateless Transducer-based E2E
+joint rich and normalized ASR system, ready for streaming applications, with a
+limited amount of rich labeled data. The first approach uses a language model
+to generate pseudo-rich transcriptions of normalized training data. The second
+approach uses a single decoder conditioned on the type of the output. The first
+approach leads to an E2E rich ASR system that performs better on out-of-domain data, with
+up to 9% relative reduction in errors. The second approach demonstrates the
+feasibility of an E2E joint rich and normalized ASR system using as little as 5%
+rich training data with a moderate (2.42% absolute) increase in errors.
+

+
+ comment: Submitted to ICASSP 2024 +
+
+
+
+
+ + ☆ SenTest: Evaluating Robustness of Sentence Encoders + + +
+ Contrastive learning has proven to be an effective method for pre-training
+models using weakly labeled data in the vision domain. Sentence transformers
+are the NLP counterparts to this architecture, and have been growing in
+popularity due to their rich and effective sentence representations. Having
+effective sentence representations is paramount in multiple tasks, such as
+information retrieval, retrieval augmented generation (RAG), and sentence
+comparison. Keeping in mind the deployability factor of transformers,
+evaluating the robustness of sentence transformers is of utmost importance.
+This work focuses on evaluating the robustness of sentence encoders. We
+employ several adversarial attacks to evaluate their robustness. This system uses
+character-level attacks in the form of random character substitution,
+word-level attacks in the form of synonym replacement, and sentence-level
+attacks in the form of intra-sentence word order shuffling. The results of the
+experiments strongly undermine the robustness of sentence encoders. The models
+produce significantly different predictions as well as embeddings on perturbed
+datasets. The accuracy of the models can fall by up to 15 percent on perturbed
+datasets as compared to unperturbed datasets. Furthermore, the experiments
+demonstrate that these embeddings do capture the semantic and syntactic
+structure (sentence order) of sentences. However, existing supervised
+classification strategies fail to leverage this information, and merely
+function as n-gram detectors.
+

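The character-level perturbation plus embedding comparison can be sketched as follows; the random substitution rate and the sentence-transformers checkpoint are illustrative assumptions about the setup, not the paper's exact attack configuration.

```python
# Sketch: random character substitution and cosine similarity between the clean
# and perturbed sentence embeddings (assumes sentence-transformers is installed).
import random
from sentence_transformers import SentenceTransformer, util

def perturb_chars(text, rate=0.1, seed=0):
    rng = random.Random(seed)
    chars = list(text)
    for i, c in enumerate(chars):
        if c.isalpha() and rng.random() < rate:
            chars[i] = rng.choice("abcdefghijklmnopqrstuvwxyz")
    return "".join(chars)

model = SentenceTransformer("all-MiniLM-L6-v2")   # example checkpoint
clean = "The quick brown fox jumps over the lazy dog."
noisy = perturb_chars(clean)
emb = model.encode([clean, noisy], convert_to_tensor=True)
print(noisy, float(util.cos_sim(emb[0], emb[1])))
```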
+
+
+
+
+ + ☆ How to Build an AI Tutor that Can Adapt to Any Course and Provide + Accurate Answers Using Large Language Model and Retrieval-Augmented + Generation + + +
+ Artificial intelligence is transforming education through data-driven, +personalized learning solutions. This paper introduces AI Tutor, an innovative +web application that provides personalized tutoring in any subject using +state-of-the-art Large Language Model (LLM). AI Tutor ingests course materials +to construct an adaptive knowledge base tailored to the course. When students +pose questions, it retrieves the most relevant information and generates +detailed, conversational responses citing supporting evidence. The system is +powered by advanced large language models and Retrieval-Augmented Generation +(RAG) techniques for accurate, natural question answering. We present a +fully-functional web interface and video demonstration that showcase AI Tutor's +versatility across diverse subjects and its ability to produce pedagogically +cogent responses. While an initial prototype, this work represents a pioneering +step toward AI-enabled tutoring systems that can democratize access to +high-quality, customized educational support. + +
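The retrieve-then-generate flow can be sketched independently of any particular stack; the `embed` function, in-memory index, and `generate_answer` callback below are hypothetical placeholders, not the AI Tutor implementation.

```python
# Minimal retrieval-augmented generation sketch over course materials.
# embed() returns random placeholder vectors to keep the structure runnable;
# swap in a real embedding model and an LLM call for meaningful behavior.
import numpy as np

def embed(texts):
    rng = np.random.default_rng(abs(hash(tuple(texts))) % (2 ** 32))
    return rng.normal(size=(len(texts), 64))

def build_index(chunks):
    return embed(chunks), chunks                 # (vectors, original text)

def retrieve(question, index, k=3):
    vectors, chunks = index
    q = embed([question])[0]
    sims = vectors @ q / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(q) + 1e-9)
    return [chunks[i] for i in np.argsort(-sims)[:k]]

def answer(question, index, generate_answer):
    context = "\n".join(retrieve(question, index))
    prompt = f"Answer using only this context:\n{context}\n\nQuestion: {question}"
    return generate_answer(prompt)               # e.g., a call to an LLM API
```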
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ☆ AviationGPT: A Large Language Model for the Aviation Domain + + +
+ The advent of ChatGPT and GPT-4 has captivated the world with large language +models (LLMs), demonstrating exceptional performance in question-answering, +summarization, and content generation. The aviation industry is characterized +by an abundance of complex, unstructured text data, replete with technical +jargon and specialized terminology. Moreover, labeled data for model building +are scarce in this domain, resulting in low usage of aviation text data. The +emergence of LLMs presents an opportunity to transform this situation, but +there is a lack of LLMs specifically designed for the aviation domain. To +address this gap, we propose AviationGPT, which is built on open-source LLaMA-2 +and Mistral architectures and continuously trained on a wealth of carefully +curated aviation datasets. Experimental results reveal that AviationGPT offers +users multiple advantages, including the versatility to tackle diverse natural +language processing (NLP) problems (e.g., question-answering, summarization, +document writing, information extraction, report querying, data cleaning, and +interactive data exploration). It also provides accurate and contextually +relevant responses within the aviation domain and significantly improves +performance (e.g., over a 40% performance gain in tested cases). With +AviationGPT, the aviation industry is better equipped to address more complex +research problems and enhance the efficiency and safety of National Airspace +System (NAS) operations. + +
+
+
+
+
+ + ☆ Improving Minority Stress Detection with Emotions + + +
+ Psychological stress detection is an important task for mental healthcare +research, but there has been little prior work investigating the effectiveness +of psychological stress models on minority individuals, who are especially +vulnerable to poor mental health outcomes. In this work, we use the related +task of minority stress detection to evaluate the ability of psychological +stress models to understand the language of sexual and gender minorities. We +find that traditional psychological stress models underperform on minority +stress detection, and we propose using emotion-infused models to reduce that +performance disparity. We further demonstrate that multi-task psychological +stress models outperform the current state-of-the-art for minority stress +detection without directly training on minority stress data. We provide +explanatory analysis showing that minority communities have different +distributions of emotions than the general population and that emotion-infused +models improve the performance of stress models on underrepresented groups +because of their effectiveness in low-data environments, and we propose that +integrating emotions may benefit underrepresented groups in other mental health +detection tasks. + +
+
+ comment: 9 pages, 6 figures, under review +
+
+
+
+
+ + ☆ TimeBench: A Comprehensive Evaluation of Temporal Reasoning Abilities in + Large Language Models + + +
+ Understanding time is a pivotal aspect of human cognition, crucial in the +broader framework of grasping the intricacies of the world. Previous studies +typically focus on specific aspects of time, lacking a comprehensive temporal +reasoning benchmark. To address this issue, we propose TimeBench, a +comprehensive hierarchical temporal reasoning benchmark that covers a broad +spectrum of temporal reasoning phenomena, which provides a thorough evaluation +for investigating the temporal reasoning capabilities of large language models. +We conduct extensive experiments on popular LLMs, such as GPT-4, LLaMA2, and +Mistral, incorporating chain-of-thought prompting. Our experimental results +indicate a significant performance gap between the state-of-the-art LLMs and +humans, highlighting that there is still a considerable distance to cover in +temporal reasoning. We aspire for TimeBench to serve as a comprehensive +benchmark, fostering research in temporal reasoning for LLMs. Our resource is +available at https://github.com/zchuz/TimeBench + +
+
+ comment: Resources at: https://github.com/zchuz/TimeBench +
+
+
+
+
+ + ☆ VIM: Probing Multimodal Large Language Models for Visual Embedded + Instruction Following + + +
+ We introduce VISUAL EMBEDDED INSTRUCTION (VIM), a new framework designed to +evaluate the visual instruction following capability of Multimodal Large +Language Models (MLLMs). As illustrated in Figure 2, VIM challenges the MLLMs +by embedding the instructions into the visual scenes, demanding strong visual +interpretative skills for instruction following. We adapt VIM to various +benchmarks, including VQAv2, MME, MM-Vet, and RefCOCO series, compose a VIM +bench, and probe diverse MLLMs across three distinct in-context learning +settings: Zero Shot, One Shot, and Pair Shot. We observe that there is a +significant performance disparity between the open-source MLLMs and GPT-4V, +implying that their proficiency in visual instruction comprehension is not up +to par. Our results highlight a promising direction for the enhancement of +MLLMs capabilities on instruction following. We aim VIM to serve as a useful +norm for advancing the state of the art and driving further progress in the +field. + +
+
+ comment: 20 pages, 8 figures, 20 tables +
+
+
+
+
+ + ☆ Introduction to Transformers: an NLP Perspective + + +
+ Transformers have dominated empirical machine learning models of natural +language processing. In this paper, we introduce basic concepts of Transformers +and present key techniques that form the recent advances of these models. This +includes a description of the standard Transformer architecture, a series of +model refinements, and common applications. Given that Transformers and related +deep learning techniques might be evolving in ways we have never seen, we +cannot dive into all the model details or cover all the technical areas. +Instead, we focus on just those concepts that are helpful for gaining a good +understanding of Transformers and their variants. We also summarize the key +ideas that impact this field, thereby yielding some insights into the strengths +and limitations of these models. + +
+
+ comment: 119 pages and 21 figures +
+
+
+
+
+ + ☆ LanGWM: Language Grounded World Model + + +
+ Recent advances in deep reinforcement learning have showcased its potential
+in tackling complex tasks. However, experiments on visual control tasks have
+revealed that state-of-the-art reinforcement learning models struggle with
+out-of-distribution generalization. Conversely, expressing higher-level
+concepts and global contexts is relatively easy using language.
+ Building upon the recent success of large language models, our main objective
+is to improve the state abstraction technique in reinforcement learning by
+leveraging language for robust action selection. Specifically, we focus on
+learning language-grounded visual features to enhance the world model learning,
+a model-based reinforcement learning technique.
+ To enforce our hypothesis explicitly, we mask out the bounding boxes of a few
+objects in the image observation and provide the text prompt as descriptions
+for these masked objects. Subsequently, we predict the masked objects along
+with the surrounding regions as pixel reconstruction, similar to the
+transformer-based masked autoencoder approach.
+ Our proposed LanGWM: Language Grounded World Model achieves state-of-the-art
+performance in the out-of-distribution test at the 100K interaction step
+benchmark of iGibson point navigation tasks. Furthermore, our proposed
+technique of explicit language-grounded visual representation learning has the
+potential to improve models for human-robot interaction because our extracted
+visual features are language grounded.
+

+
+
+
+
+ + ☆ Reinforcement Replaces Supervision: Query focused Summarization using + Deep Reinforcement Learning + + +
+ Query-focused Summarization (QfS) deals with systems that generate summaries +from document(s) based on a query. Motivated by the insight that Reinforcement +Learning (RL) provides a generalization to Supervised Learning (SL) for Natural +Language Generation, and thereby performs better (empirically) than SL, we use +an RL-based approach for this task of QfS. Additionally, we also resolve the +conflict of employing RL in Transformers with Teacher Forcing. We develop +multiple Policy Gradient networks, trained on various reward signals: ROUGE, +BLEU, and Semantic Similarity, which lead to a 10-point improvement over the +State-of-the-Art approach on the ROUGE-L metric for a benchmark dataset (ELI5). +We also show performance of our approach in zero-shot setting for another +benchmark dataset (DebatePedia) -- our approach leads to results comparable to +baselines, which were specifically trained on DebatePedia. To aid the RL +training, we propose a better semantic similarity reward, enabled by a novel +Passage Embedding scheme developed using Cluster Hypothesis. Lastly, we +contribute a gold-standard test dataset to further research in QfS and +Long-form Question Answering (LfQA). + +
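The reward-driven training described above amounts to a REINFORCE-style objective with a summarization metric as the reward; the sketch below shows that loss shape on a toy policy and uses a scalar stand-in reward instead of actual ROUGE, BLEU, or semantic-similarity scorers.

```python
# Sketch of a REINFORCE-style loss with a sequence-level reward (e.g., ROUGE).
# The toy decoder outputs and reward value stand in for a summarizer and a real metric.
import torch
import torch.nn.functional as F

def reinforce_loss(logits, sampled_ids, reward, baseline=0.0):
    """logits: (T, vocab); sampled_ids: (T,); reward: scalar for the whole summary."""
    log_probs = F.log_softmax(logits, dim=-1)
    seq_log_prob = log_probs.gather(1, sampled_ids.unsqueeze(1)).sum()
    return -(reward - baseline) * seq_log_prob      # maximize expected reward

logits = torch.randn(5, 100, requires_grad=True)    # toy decoder outputs
sampled = torch.distributions.Categorical(logits=logits).sample()
reward = 0.42                                        # e.g., ROUGE-L of the sampled summary
loss = reinforce_loss(logits, sampled, reward, baseline=0.3)
loss.backward()                                      # gradients flow to the policy
```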
+
+
+
+
+ + ☆ Enhancing Answer Selection in Community Question Answering with + Pre-trained and Large Language Models + + +
+ Community Question Answering (CQA) has become increasingly prevalent in recent
+years. However, questions often receive a large number of answers, making it difficult for
+users to select the relevant ones. Therefore, answer selection is a very
+significant subtask of CQA. In this paper, we first propose the Question-Answer
+cross attention networks (QAN) with pre-trained models for answer selection and
+utilize a large language model (LLM) to perform answer selection with knowledge
+augmentation. Specifically, we apply the BERT model as the encoder layer to do
+pre-training for question subjects, question bodies and answers, respectively,
+then the cross attention mechanism selects the most relevant answer for
+different questions. Experiments show that the QAN model achieves
+state-of-the-art performance on two datasets, SemEval2015 and SemEval2017.
+Moreover, we use the LLM to generate external knowledge from questions and
+correct answers to achieve knowledge augmentation for the answer selection task
+by LLM, while optimizing the prompt of LLM in different aspects. The results
+show that the introduction of external knowledge can improve the correct answer
+selection rate of LLM on datasets SemEval2015 and SemEval2017. Meanwhile, the LLM
+can also select the correct answer on more questions with an optimized prompt.
+

+
+ comment: 24 pages, 4 figures, 14 tables
+

+
+
+
+
+ + ☆ Mergen: The First Manchu-Korean Machine Translation Model Trained on + Augmented Data + + +
+ The Manchu language, with its roots in the historical Manchurian region of
+Northeast China, is now facing a critical threat of extinction, as there are
+very few speakers left. In our efforts to safeguard the Manchu language, we
+introduce Mergen, the first-ever attempt at a Manchu-Korean Machine Translation
+(MT) model. To develop this model, we utilize valuable resources such as the
+Manwen Laodang (a historical book) and a Manchu-Korean dictionary. Due to the
+scarcity of a Manchu-Korean parallel dataset, we expand our data by employing
+word replacement guided by GloVe embeddings, trained on both monolingual and
+parallel texts. Our approach is built around an encoder-decoder neural machine
+translation model, incorporating a bi-directional Gated Recurrent Unit (GRU)
+layer. The experiments have yielded promising results, showcasing a significant
+enhancement in Manchu-Korean translation, with a remarkable 20-30 point
+increase in the BLEU score.
+

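The embedding-guided word replacement used for augmentation can be sketched generically; the tiny hand-made vectors below stand in for GloVe embeddings trained on the Manchu and Korean corpora.

```python
# Sketch of embedding-guided word replacement for augmenting parallel data.
# The tiny hand-made vectors are placeholders for trained GloVe embeddings.
import numpy as np

vectors = {                       # word -> embedding (placeholder values)
    "river":  np.array([0.90, 0.10, 0.00]),
    "stream": np.array([0.85, 0.15, 0.05]),
    "horse":  np.array([0.00, 0.90, 0.20]),
    "pony":   np.array([0.05, 0.88, 0.25]),
}

def nearest_neighbor(word):
    v = vectors[word]
    best, best_sim = word, -1.0
    for other, u in vectors.items():
        if other == word:
            continue
        sim = float(v @ u / (np.linalg.norm(v) * np.linalg.norm(u)))
        if sim > best_sim:
            best, best_sim = other, sim
    return best

def augment(sentence, rate=0.3, seed=0):
    rng = np.random.default_rng(seed)
    return " ".join(nearest_neighbor(w) if w in vectors and rng.random() < rate else w
                    for w in sentence.split())

print(augment("the horse crossed the river", rate=1.0))
```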
+
+ comment: emnlp2023/mrl2023 +
+
+
+
+
+ + ☆ Taiwan LLM: Bridging the Linguistic Divide with a Culturally Aligned + Language Model + + +
+ In the realm of language models, the nuanced linguistic and cultural +intricacies of Traditional Chinese, as spoken in Taiwan, have been largely +overlooked. This paper introduces Taiwan LLM, a pioneering Large Language Model +that specifically caters to the Traditional Chinese language, with a focus on +the variant used in Taiwan. Leveraging a comprehensive pretraining corpus and +instruction-finetuning datasets, we have developed a model that not only +understands the complexities of Traditional Chinese but also embodies the +cultural context of Taiwan. Taiwan LLM represents the first of its kind, a +model that is not only linguistically accurate but also culturally resonant +with its user base. Our evaluations demonstrate that Taiwan LLM achieves +superior performance in understanding and generating Traditional Chinese text, +outperforming existing models that are predominantly trained on Simplified +Chinese or English. The open-source release of Taiwan LLM invites collaboration +and further innovation, ensuring that the linguistic diversity of Chinese +speakers is embraced and well-served. The model, datasets, and further +resources are made publicly available to foster ongoing research and +development in this field. + +
+
+
+
+
+ + ☆ CLOMO: Counterfactual Logical Modification with Large Language Models + + +
+ In this study, we delve into the realm of counterfactual reasoning +capabilities of large language models (LLMs). Our primary objective is to +cultivate the counterfactual thought processes within LLMs and rigorously +assess these processes for their validity. Specifically, we introduce a novel +task, Counterfactual Logical Modification (CLOMO), and a high-quality +human-annotated benchmark. In this task, LLMs must adeptly alter a given +argumentative text to uphold a predetermined logical relationship. To +effectively evaluate a generation model's counterfactual capabilities, we +propose an innovative evaluation metric, the LogicAware Counterfactual Score to +directly evaluate the natural language output of LLMs instead of modeling the +task as a multiple-choice problem. Analysis shows that the proposed automatic +metric aligns well with human preference. Our experimental results show that +while LLMs demonstrate a notable capacity for logical counterfactual thinking, +there remains a discernible gap between their current abilities and human +performance. + +
+
+
+
+
+ + ☆ TARGET: Template-Transferable Backdoor Attack Against Prompt-based NLP + Models via GPT4 + + +
+ Prompt-based learning has been widely applied in many low-resource NLP tasks
+such as few-shot scenarios. However, this paradigm has been shown to be
+vulnerable to backdoor attacks. Most of the existing attack methods focus on
+inserting manually predefined templates as triggers in the pre-training phase
+to train the victim model and utilize the same triggers in the downstream task
+to perform inference, which tends to ignore the transferability and
+stealthiness of the templates. In this work, we propose a novel approach of
+TARGET (Template-trAnsfeRable backdoor attack aGainst prompt-basEd NLP models
+via GPT4), which is a data-independent attack method. Specifically, we first
+utilize GPT4 to reformulate manual templates to generate tone-strong and normal
+templates, and the former are injected into the model as a backdoor trigger in
+the pre-training phase. Then, we not only directly employ the above templates
+in the downstream task, but also use GPT4 to generate templates with similar
+tone to the above templates to carry out transferable attacks. Finally, we conduct
+extensive experiments on five NLP datasets and three BERT series
+models, with experimental results demonstrating that our TARGET method has better
+attack performance and stealthiness compared to the two external baseline
+methods on direct attacks, and in addition achieves satisfactory attack
+capability on the unseen tone-similar templates.
+

+
+
+
+
+ + ☆ VITATECS: A Diagnostic Dataset for Temporal Concept Understanding of + Video-Language Models + + +
+ The ability to perceive how objects change over time is a crucial ingredient +in human intelligence. However, current benchmarks cannot faithfully reflect +the temporal understanding abilities of video-language models (VidLMs) due to +the existence of static visual shortcuts. To remedy this issue, we present +VITATECS, a diagnostic VIdeo-Text dAtaset for the evaluation of TEmporal +Concept underStanding. Specifically, we first introduce a fine-grained taxonomy +of temporal concepts in natural language in order to diagnose the capability of +VidLMs to comprehend different temporal aspects. Furthermore, to disentangle +the correlation between static and temporal information, we generate +counterfactual video descriptions that differ from the original one only in the +specified temporal aspect. We employ a semi-automatic data collection framework +using large language models and human-in-the-loop annotation to obtain +high-quality counterfactual descriptions efficiently. Evaluation of +representative video-language understanding models confirms their deficiency in +temporal understanding, revealing the need for greater emphasis on the temporal +elements in video-language research. + +
+
+ comment: 23 pages, 6 figures, 18 tables, data is available at + https://github.com/lscpku/VITATECS +
+
+
+
+
+ + ☆ Improving the Robustness of Transformer-based Large Language Models with + Dynamic Attention + + +
+ Transformer-based models, such as BERT and GPT, have been widely adopted in
+natural language processing (NLP) due to their exceptional performance.
+However, recent studies show their vulnerability to textual adversarial attacks
+where the model's output can be misled by intentionally manipulating the text
+inputs. Despite various methods that have been proposed to enhance the model's
+robustness and mitigate this vulnerability, many require heavy resource
+consumption (e.g., adversarial training) or only provide limited protection
+(e.g., defensive dropout). In this paper, we propose a novel method called
+dynamic attention, tailored for the transformer architecture, to enhance the
+inherent robustness of the model itself against various adversarial attacks.
+Our method requires no downstream task knowledge and does not incur additional
+costs. The proposed dynamic attention consists of two modules: (i) attention
+rectification, which masks or weakens the attention value of the chosen tokens,
+and (ii) dynamic modeling, which dynamically builds the set of candidate
+tokens. Extensive experiments demonstrate that dynamic attention significantly
+mitigates the impact of adversarial attacks, achieving up to 33\% better
+performance than previous methods against widely-used adversarial attacks. The
+model-level design of dynamic attention enables it to be easily combined with
+other defense methods (e.g., adversarial training) to further enhance the
+model's robustness. Furthermore, we demonstrate that dynamic attention
+preserves the state-of-the-art robustness space of the original model compared
+to other dynamic modeling methods.
+

+
+
+
+
+ + ☆ Unveiling the Implicit Toxicity in Large Language Models EMNLP 2023 + + +
+ The open-endedness of large language models (LLMs) combined with their
+impressive capabilities may lead to new safety issues when they are exploited
+for malicious use. While recent studies primarily focus on probing toxic
+outputs that can be easily detected with existing toxicity classifiers, we
+show that LLMs can generate diverse implicit toxic outputs that are
+exceptionally difficult to detect via simple zero-shot prompting. Moreover, we
+propose a reinforcement learning (RL) based attacking method to further induce
+the implicit toxicity in LLMs. Specifically, we optimize the language model
+with a reward that prefers implicit toxic outputs to explicit toxic and
+non-toxic ones. Experiments on five widely-adopted toxicity classifiers
+demonstrate that the attack success rate can be significantly improved through
+RL fine-tuning. For instance, the RL-finetuned LLaMA-13B model achieves an
+attack success rate of 90.04% on BAD and 62.85% on Davinci003. Our findings
+suggest that LLMs pose a significant threat in generating undetectable
+implicit toxic outputs. We further show that fine-tuning toxicity classifiers
+on the annotated examples from our attacking method can effectively enhance
+their ability to detect LLM-generated implicit toxic language. The code is
+publicly available at https://github.com/thu-coai/Implicit-Toxicity.
+
+
+ comment: EMNLP 2023 Main Conference +
+
+
+
+
+ + ☆ CESAR: Automatic Induction of Compositional Instructions for Multi-turn + Dialogs EMNLP 2023 + + +
+ Instruction-based multitasking has played a critical role in the success of
+large language models (LLMs) in multi-turn dialog applications. While publicly
+available LLMs have shown promising performance, they lag behind
+state-of-the-art models like ChatGPT when exposed to complex instructions with
+multiple constraints. In this work, we hypothesize that the availability of
+large-scale complex demonstrations is crucial in bridging this gap. Focusing
+on dialog applications, we propose a novel framework, CESAR, that unifies a
+large number of dialog tasks in the same format and allows programmatic
+induction of complex instructions without any manual effort.
+ We apply CESAR on InstructDial, a benchmark for instruction-based dialog
+tasks. We further enhance InstructDial with new datasets and tasks and utilize
+CESAR to induce complex tasks with compositional instructions. This results in
+a new benchmark called InstructDial++, which includes 63 datasets with 86 basic
+tasks and 68 composite tasks. Through rigorous experiments, we demonstrate the
+scalability of CESAR in providing rich instructions. Models trained on
+InstructDial++ can follow compositional prompts, such as prompts that ask for
+multiple stylistic constraints.
+
+
+ comment: EMNLP 2023 +
+
+
+
+
+ + ☆ Are we going MAD? Benchmarking Multi-Agent Debate between Language + Models for Medical Q&A NeurIPS + + +
+ Recent advancements in large language models (LLMs) underscore their
+potential for responding to medical inquiries. However, ensuring that
+generative agents provide accurate and reliable answers remains an ongoing
+challenge. In this context, multi-agent debate (MAD) has emerged as a
+prominent strategy for enhancing the truthfulness of LLMs. In this work, we
+provide a comprehensive benchmark of MAD strategies for medical Q&A, along
+with open-source implementations. This benchmark explores the effective use of
+various strategies, including the trade-offs between cost, time, and accuracy.
+We build upon these insights to provide a novel debate-prompting strategy
+based on agent agreement that outperforms previously published strategies on
+medical Q&A tasks.
+
+
+ comment: 16 pages, 6 figures, NeurIPS DGM4H Workshop 2023 +
+
+
+
+
+ + ☆ Are Large Language Models Good Fact Checkers: A Preliminary Study + + +
+ Recently, Large Language Models (LLMs) have drawn significant attention due
+to their outstanding reasoning capabilities and extensive knowledge
+repository, positioning them as superior in handling various natural language
+processing tasks compared to other language models. In this paper, we present
+a preliminary investigation into the potential of LLMs in fact-checking. This
+study aims to comprehensively evaluate various LLMs on specific fact-checking
+subtasks, systematically assess their capabilities, and conduct a comparative
+analysis of their performance against pre-trained and state-of-the-art
+low-parameter models. Experiments demonstrate that LLMs achieve competitive
+performance compared to other small models in most scenarios. However, they
+encounter challenges in effectively handling Chinese fact verification and the
+entirety of the fact-checking pipeline due to language inconsistencies and
+hallucinations. These findings underscore the need for further exploration and
+research to enhance the proficiency of LLMs as reliable fact-checkers,
+unveiling both the potential capabilities of LLMs and the possible challenges
+in fact-checking tasks.
+
+
+
+
+
+ + ☆ Efficient Stitchable Task Adaptation + + +
+ The paradigm of pre-training and fine-tuning has laid the foundation for +deploying deep learning models. However, most fine-tuning methods are designed +to meet a specific resource budget. Recently, considering diverse deployment +scenarios with various resource budgets, stitchable neural network (SN-Net) is +introduced to quickly obtain numerous new networks (stitches) from the +pre-trained models (anchors) in a model family via model stitching. Although +promising, SN-Net confronts new challenges when adapting it to new target +domains, including huge memory and storage requirements and a long and +sub-optimal multistage adaptation process. In this work, we present a novel +framework, Efficient Stitchable Task Adaptation (ESTA), to efficiently produce +a palette of fine-tuned models that adhere to diverse resource constraints. +Specifically, we first tailor parameter-efficient fine-tuning to share low-rank +updates among the stitches while maintaining independent bias terms. In this +way, we largely reduce fine-tuning memory burdens and mitigate the interference +among stitches that arises in task adaptation. Furthermore, we streamline a +simple yet effective one-stage deployment pipeline, which estimates the +important stitches to deploy with training-time gradient statistics. By +assigning higher sampling probabilities to important stitches, we also get a +boosted Pareto frontier. Extensive experiments on 25 downstream visual +recognition tasks demonstrate that our ESTA is capable of generating stitches +with smooth accuracy-efficiency trade-offs and surpasses the direct SN-Net +adaptation by remarkable margins with significantly lower training time and +fewer trainable parameters. Furthermore, we demonstrate the flexibility and +scalability of our ESTA framework by stitching LLMs from LLaMA family, +obtaining chatbot stitches of assorted sizes. + +
+
+ comment: Source code will be released at + https://github.com/ziplab/Stitched_LLaMA +
+
+
+
+
+ + ☆ Exploring Large Language Models for Human Mobility Prediction under + Public Events + + +
+ Public events, such as concerts and sports games, can be major attractors for +large crowds, leading to irregular surges in travel demand. Accurate human +mobility prediction for public events is thus crucial for event planning as +well as traffic or crowd management. While rich textual descriptions about +public events are commonly available from online sources, it is challenging to +encode such information in statistical or machine learning models. Existing +methods are generally limited in incorporating textual information, handling +data sparsity, or providing rationales for their predictions. To address these +challenges, we introduce a framework for human mobility prediction under public +events (LLM-MPE) based on Large Language Models (LLMs), leveraging their +unprecedented ability to process textual data, learn from minimal examples, and +generate human-readable explanations. Specifically, LLM-MPE first transforms +raw, unstructured event descriptions from online sources into a standardized +format, and then segments historical mobility data into regular and +event-related components. A prompting strategy is designed to direct LLMs in +making and rationalizing demand predictions considering historical mobility and +event features. A case study is conducted for Barclays Center in New York City, +based on publicly available event information and taxi trip data. Results show +that LLM-MPE surpasses traditional models, particularly on event days, with +textual data significantly enhancing its accuracy. Furthermore, LLM-MPE offers +interpretable insights into its predictions. Despite the great potential of +LLMs, we also identify key challenges including misinformation and high costs +that remain barriers to their broader adoption in large-scale human mobility +analysis. + +
+
+
+
+
+ + ☆ Biomedical knowledge graph-enhanced prompt generation for large language + models + + +
+ Large Language Models (LLMs) have been driving progress in AI at an
+unprecedented rate, yet still face challenges in knowledge-intensive domains
+like biomedicine. Solutions such as pre-training and domain-specific
+fine-tuning add substantial computational overhead, and the latter also
+requires domain expertise. External knowledge infusion is task-specific and
+requires model training. Here, we introduce a task-agnostic Knowledge
+Graph-based Retrieval Augmented Generation (KG-RAG) framework that leverages
+the massive biomedical KG SPOKE with LLMs such as Llama-2-13b, GPT-3.5-Turbo
+and GPT-4 to generate meaningful biomedical text rooted in established
+knowledge. KG-RAG consistently enhanced the performance of LLMs across various
+prompt types, including one-hop and two-hop prompts, drug repurposing queries,
+biomedical true/false questions, and multiple-choice questions (MCQ). Notably,
+KG-RAG provides a remarkable 71% boost in the performance of the Llama-2 model
+on the challenging MCQ dataset, demonstrating the framework's capacity to
+empower open-source models with fewer parameters for domain-specific
+questions. Furthermore, KG-RAG enhanced the performance of proprietary GPT
+models, such as GPT-3.5, which exhibited improvement over GPT-4 in context
+utilization on MCQ data. Our approach was also able to address drug
+repurposing questions, returning meaningful repurposing suggestions. In
+summary, the proposed framework combines the explicit knowledge of the KG and
+the implicit knowledge of the LLM in an optimized fashion, thus enhancing the
+adaptability of general-purpose LLMs to tackle domain-specific questions in a
+unified framework.
+
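+ As a rough, hedged illustration of the retrieval-augmented prompting pattern
+described above (the retrieval function, prompt wording, and parameter names
+are hypothetical placeholders, not the KG-RAG API), a prompt might be
+assembled from retrieved knowledge-graph triples like this:
+
+def build_kg_rag_prompt(question, retrieve_triples, top_k=10):
+    """retrieve_triples(question, top_k) is assumed to return
+    (subject, relation, object) tuples from a biomedical KG such as SPOKE."""
+    triples = retrieve_triples(question, top_k)
+    context = "\n".join(f"{s} {r} {o}." for s, r, o in triples)
+    return (
+        "Answer the question using only the biomedical facts below.\n"
+        f"Facts:\n{context}\n\nQuestion: {question}\nAnswer:"
+    )
+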
+
+ comment: 28 pages, 5 figures, 2 tables, 1 supplementary file +
+
+
+
+
+ + ☆ Universal Self-Consistency for Large Language Model Generation + + +
+ Self-consistency with chain-of-thought prompting (CoT) has demonstrated +remarkable performance gains on various challenging tasks, by utilizing +multiple reasoning paths sampled from large language models (LLMs). However, +self-consistency relies on the answer extraction process to aggregate multiple +solutions, which is not applicable to free-form answers. In this work, we +propose Universal Self-Consistency (USC), which leverages LLMs themselves to +select the most consistent answer among multiple candidates. We evaluate USC on +a variety of benchmarks, including mathematical reasoning, code generation, +long-context summarization, and open-ended question answering. On open-ended +generation tasks where the original self-consistency method is not applicable, +USC effectively utilizes multiple samples and improves the performance. For +mathematical reasoning, USC matches the standard self-consistency performance +without requiring the answer formats to be similar. Finally, without access to +execution results, USC also matches the execution-based voting performance on +code generation. + +
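+ A minimal sketch of the idea as stated in the abstract -- sample several
+candidates, then ask the model itself to pick the most consistent one; the
+generate() callable and the selection prompt are assumptions for illustration,
+not the paper's exact setup:
+
+def universal_self_consistency(question, generate, n_samples=8):
+    """generate(prompt, n) is assumed to return a list of n sampled completions."""
+    candidates = generate(question, n_samples)
+    numbered = "\n".join(f"Response {i + 1}: {c}" for i, c in enumerate(candidates))
+    selector_prompt = (
+        f"{question}\n\n{numbered}\n\n"
+        "Which response is most consistent with the others? "
+        "Reply with the response number only."
+    )
+    reply = generate(selector_prompt, 1)[0]
+    digits = "".join(ch for ch in reply if ch.isdigit()) or "1"
+    index = max(1, min(int(digits), len(candidates)))
+    return candidates[index - 1]
+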
+
+
+
+
+ + ☆ RoKEPG: RoBERTa and Knowledge Enhancement for Prescription Generation of + Traditional Chinese Medicine + + +
+ Traditional Chinese medicine (TCM) prescription is the most critical form of
+TCM treatment, and uncovering the complex nonlinear relationship between
+symptoms and TCM is of great significance for clinical practice and for
+assisting physicians in diagnosis and treatment. Although there have been some
+studies on TCM prescription generation, these studies consider a single factor
+and directly model the symptom-prescription generation problem mainly based on
+symptom descriptions, lacking guidance from TCM knowledge. To this end, we
+propose a RoBERTa and Knowledge Enhancement model for Prescription Generation
+of Traditional Chinese Medicine (RoKEPG). RoKEPG is first pre-trained on our
+constructed TCM corpus and then fine-tuned; during fine-tuning, the model is
+guided to generate TCM prescriptions by introducing four classes of TCM
+knowledge through the attention mask matrix. Experimental results on the
+publicly available TCM prescription dataset show that RoKEPG improves the F1
+metric by about 2% over the best-performing baseline model.
+
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Language Models: A Guide for the Perplexed + + +
+ Given the growing importance of AI literacy, we decided to write this +tutorial to help narrow the gap between the discourse among those who study +language models -- the core technology underlying ChatGPT and similar products +-- and those who are intrigued and want to learn more about them. In short, we +believe the perspective of researchers and educators can add some clarity to +the public's understanding of the technologies beyond what's currently +available, which tends to be either extremely technical or promotional material +generated about products by their purveyors. + Our approach teases apart the concept of a language model from products built +on them, from the behaviors attributed to or desired from those products, and +from claims about similarity to human cognition. As a starting point, we (1) +offer a scientific viewpoint that focuses on questions amenable to study +through experimentation; (2) situate language models as they are today in the +context of the research that led to their development; and (3) describe the +boundaries of what is known about the models at this writing. + +
+
+
+
+
+ + ☆ Elo Uncovered: Robustness and Best Practices in Language Model + Evaluation EMNLP 2023 + + +
+ In Natural Language Processing (NLP), the Elo rating system, originally
+designed for ranking players in dynamic games such as chess, is increasingly
+being used to evaluate Large Language Models (LLMs) through "A vs B" paired
+comparisons. However, while popular, the system's suitability for assessing
+entities with constant skill levels, such as LLMs, remains relatively
+unexplored. We study two fundamental axioms that evaluation methods should
+adhere to: reliability and transitivity. We conduct an extensive evaluation of
+Elo behaviour, illustrating that individual Elo computations exhibit
+volatility, and delve into the impact of varying the Elo rating system's
+hyperparameters. We show that these axioms are not always satisfied, raising
+questions about the reliability of current comparative evaluations of LLMs. If
+the current use of Elo scores is intended to substitute the costly
+head-to-head comparison of LLMs, it is crucial to ensure the ranking is as
+robust as possible. Guided by the axioms, our findings offer concrete
+guidelines for enhancing the reliability of LLM evaluation methods, suggesting
+a need for reassessment of existing comparative approaches.
+
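+ For reference, the textbook Elo update that underlies such "A vs B"
+comparisons is shown below; the K-factor is one of the hyperparameters whose
+effect on rating volatility the paper examines (this is the standard formula,
+not code from the paper):
+
+def elo_update(rating_a, rating_b, score_a, k=32):
+    """score_a is 1.0 if A wins, 0.0 if A loses, and 0.5 for a tie."""
+    expected_a = 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400))
+    new_a = rating_a + k * (score_a - expected_a)
+    new_b = rating_b + k * ((1.0 - score_a) - (1.0 - expected_a))
+    return new_a, new_b
+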
+
+ comment: 22 pages, 7 figures, 2 tables. Revised version of the paper accepted + at GEM Workshop, EMNLP 2023 +
+
+
+
+
+ + ☆ Uncertainty Guided Global Memory Improves Multi-Hop Question Answering EMNLP 2023 + + +
+ Transformers have become the gold standard for many natural language
+processing tasks and, in particular, for multi-hop question answering (MHQA).
+This task involves processing a long document and reasoning over multiple
+parts of it. The landscape of MHQA approaches can be classified into two
+primary categories. The first group focuses on extracting supporting evidence,
+thereby constraining the QA model's context to predicted facts. Conversely, the
+second group relies on the attention mechanism of the long input encoding model
+to facilitate multi-hop reasoning. However, attention-based token
+representations lack explicit global contextual information to connect
+reasoning steps. To address these issues, we propose GEMFormer, a two-stage
+method that first collects relevant information from the entire document into
+memory and then combines it with the local context to solve the task. Our
+experimental results show that fine-tuning a pre-trained model with
+memory-augmented input, including the most certain global elements, improves
+the model's performance on three MHQA datasets compared to the baseline. We
+also found that the global explicit memory contains information from the
+supporting facts required for the correct answer.
+
+
+ comment: 12 pages, 7 figures. EMNLP 2023. Our code is available at + https://github.com/Aloriosa/GEMFormer +
+
+
+
+
+ + ☆ DisCGen: A Framework for Discourse-Informed Counterspeech Generation AACL + + +
+ Counterspeech can be an effective method for battling hateful content on +social media. Automated counterspeech generation can aid in this process. +Generated counterspeech, however, can be viable only when grounded in the +context of topic, audience and sensitivity as these factors influence both the +efficacy and appropriateness. In this work, we propose a novel framework based +on theories of discourse to study the inferential links that connect counter +speeches to the hateful comment. Within this framework, we propose: i) a +taxonomy of counterspeech derived from discourse frameworks, and ii) +discourse-informed prompting strategies for generating contextually-grounded +counterspeech. To construct and validate this framework, we present a process +for collecting an in-the-wild dataset of counterspeech from Reddit. Using this +process, we manually annotate a dataset of 3.9k Reddit comment pairs for the +presence of hatespeech and counterspeech. The positive pairs are annotated for +10 classes in our proposed taxonomy. We annotate these pairs with paraphrased +counterparts to remove offensiveness and first-person references. We show that +by using our dataset and framework, large language models can generate +contextually-grounded counterspeech informed by theories of discourse. +According to our human evaluation, our approaches can act as a safeguard +against critical failures of discourse-agnostic models. + +
+
+ comment: IJCNLP-AACL, 2023 +
+
+
+
+
+ + ☆ ROBBIE: Robust Bias Evaluation of Large Generative Language Models EMNLP 2023 + + +
+ As generative large language models (LLMs) grow more performant and +prevalent, we must develop comprehensive enough tools to measure and improve +their fairness. Different prompt-based datasets can be used to measure social +bias across multiple text domains and demographic axes, meaning that testing +LLMs on more datasets can potentially help us characterize their biases more +fully, and better ensure equal and equitable treatment of marginalized +demographic groups. In this work, our focus is two-fold: + (1) Benchmarking: a comparison of 6 different prompt-based bias and toxicity +metrics across 12 demographic axes and 5 families of generative LLMs. Out of +those 6 metrics, AdvPromptSet and HolisticBiasR are novel datasets proposed in +the paper. The comparison of those benchmarks gives us insights about the bias +and toxicity of the compared models. Therefore, we explore the frequency of +demographic terms in common LLM pre-training corpora and how this may relate to +model biases. + (2) Mitigation: we conduct a comprehensive study of how well 3 bias/toxicity +mitigation techniques perform across our suite of measurements. ROBBIE aims to +provide insights for practitioners while deploying a model, emphasizing the +need to not only measure potential harms, but also understand how they arise by +characterizing the data, mitigate harms once found, and balance any trade-offs. +We open-source our analysis code in hopes of encouraging broader measurements +of bias in future LLMs. + +
+
+ comment: EMNLP 2023 +
+
+
+
+
+ + ☆ TurkishBERTweet: Fast and Reliable Large Language Model for Social Media + Analysis + + +
+ Turkish is one of the most widely spoken languages in the world. The wide use
+of this language on social media platforms such as Twitter, Instagram, or
+TikTok, together with the country's strategic position in world politics,
+makes it appealing to social network researchers and industry. To address this
+need, we introduce TurkishBERTweet, the first large-scale pre-trained language
+model for Turkish social media, built using almost 900 million tweets. The
+model shares the same architecture as the base BERT model but with a smaller
+input length, making TurkishBERTweet lighter than BERTurk and giving it
+significantly lower inference time. We trained our model using the same
+approach as the RoBERTa model and evaluated it on two text classification
+tasks: Sentiment Classification and Hate Speech Detection. We demonstrate that
+TurkishBERTweet outperforms the other available alternatives in
+generalizability, and its lower inference time gives it a significant
+advantage when processing large-scale datasets. We also compared our models
+with the commercial OpenAI solutions in terms of cost and performance to
+demonstrate that TurkishBERTweet is a scalable and cost-effective solution. As
+part of our research, we released TurkishBERTweet and fine-tuned LoRA adapters
+for the mentioned tasks under the MIT License to facilitate future research
+and applications on Turkish social media. Our TurkishBERTweet model is
+available at: https://github.com/ViralLab/TurkishBERTweet
+
+
+ comment: 21 pages, 4 figures, 8 tables +
+
+
+
+
+ + ☆ I Know You Did Not Write That! A Sampling Based Watermarking Method for + Identifying Machine Generated Text + + +
+ Potential harms of Large Language Models such as mass misinformation and
+plagiarism can be partially mitigated if there exists a reliable way to detect
+machine-generated text. In this paper, we propose a new watermarking method to
+detect machine-generated texts. Our method embeds a unique pattern within the
+generated text, ensuring that while the content remains coherent and natural
+to human readers, it carries distinct markers that can be identified
+algorithmically. Specifically, we intervene in the token sampling process in a
+way that enables us to trace back our token choices during the detection
+phase. We show how watermarking affects textual quality and compare our
+proposed method with a state-of-the-art watermarking method in terms of
+robustness and detectability. Through extensive experiments, we demonstrate
+the effectiveness of our watermarking scheme in distinguishing between
+watermarked and non-watermarked text, achieving high detection rates while
+maintaining textual quality.
+
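+ The abstract does not describe the exact sampling intervention, so the sketch
+below is only a generic, hypothetical illustration of sampling-based
+watermarking: seed the sampler with a secret key plus the previous token, so
+that a detector holding the key can re-derive each draw and count how often
+the observed tokens agree with it. This is a common construction in the
+watermarking literature, not necessarily the paper's scheme.
+
+import hashlib
+import random
+
+def seeded_choice(prev_token_id, probs, key="secret-key"):
+    # Derive a per-step seed from the secret key and the previous token.
+    seed = int(hashlib.sha256(f"{key}:{prev_token_id}".encode()).hexdigest(), 16)
+    rng = random.Random(seed)
+    # Sample the next token id from the model's probabilities with that seed.
+    return rng.choices(range(len(probs)), weights=probs, k=1)[0]
+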
+
+
+
+
+ + ☆ Zero-shot Conversational Summarization Evaluations with small Large + Language Models + + +
+ Large Language Models (LLMs) exhibit powerful summarization abilities.
+However, their capabilities on conversational summarization remain
+underexplored. In this work, we evaluate LLMs (approx. 10 billion parameters)
+on conversational summarization and showcase their performance on various
+prompts. We show that the summaries generated by the models depend on the
+instructions, and that the performance of LLMs varies with different
+instructions, sometimes resulting in steep drops in ROUGE scores if prompts
+are not selected carefully. We also evaluate the models with human evaluations
+and discuss the limitations of the models on conversational summarization.
+
+
+ comment: Accepted at the RoF0Mo workshop at NeurIPS 2023
+
+
+
+
+
+ + ☆ Hyperpolyglot LLMs: Cross-Lingual Interpretability in Token Embeddings + + +
+ Cross-lingual transfer learning is an important property of multilingual
+large language models (LLMs). But how do LLMs represent relationships between
+languages? Every language model has an input layer that maps tokens to
+vectors. This ubiquitous layer of language models is often overlooked. We find
+that similarities between these input embeddings are highly interpretable and
+that the geometry of these embeddings differs between model families. In one
+case (XLM-RoBERTa), embeddings encode language: tokens in different writing
+systems can be linearly separated with an average of 99.2% accuracy. Another
+family (mT5) represents cross-lingual semantic similarity: the 50 nearest
+neighbors for any token represent an average of 7.61 writing systems, and are
+frequently translations. This result is surprising given that there are no
+explicit parallel cross-lingual training corpora and no explicit incentive for
+translations in the pre-training objectives. Our research opens the door for
+investigations into 1) the effect of pre-training and model architectures on
+representations of languages and 2) the applications of cross-lingual
+representations embedded in language models.
+
+
+
+
+
+ + ☆ Filtered Semi-Markov CRF EMNLP 2023 + + +
+ Semi-Markov CRF has been proposed as an alternative to the traditional Linear +Chain CRF for text segmentation tasks such as Named Entity Recognition (NER). +Unlike CRF, which treats text segmentation as token-level prediction, Semi-CRF +considers segments as the basic unit, making it more expressive. However, +Semi-CRF suffers from two major drawbacks: (1) quadratic complexity over +sequence length, as it operates on every span of the input sequence, and (2) +inferior performance compared to CRF for sequence labeling tasks like NER. In +this paper, we introduce Filtered Semi-Markov CRF, a variant of Semi-CRF that +addresses these issues by incorporating a filtering step to eliminate +irrelevant segments, reducing complexity and search space. Our approach is +evaluated on several NER benchmarks, where it outperforms both CRF and Semi-CRF +while being significantly faster. The implementation of our method is available +on \href{https://github.com/urchade/Filtered-Semi-Markov-CRF}{Github}. + +
+
+ comment: EMNLP 2023 (Findings) +
+
+
+
+
+ + ☆ Self-Infilling Code Generation + + +
+ This work introduces a general code generation framework that incorporates +infilling operations into auto-regressive decoding. Our approach capitalizes on +the observation that recent code language models with infilling capabilities +can perform \emph{self-infilling}: whereas infilling operations aim to fill in +the middle based on a predefined prefix and suffix, self-infilling sequentially +generates both such surrounding context and the infilled content. We utilize +this feature to develop an infilling-augmented decoding process that +facilitates non-monotonic generation. This approach allows for postponing the +generation of uncertain code snippets until a definitive suffix is established, +leading to improved control over the generation sequence. In addition, it +facilitates a looping mechanism, which can iteratively update and synchronize +each piece of generation in a cyclic manner. Extensive experiments are +conducted to demonstrate that our proposed decoding process is effective in +enhancing regularity and quality across several code generation benchmarks. + +
+
+
+
+
+ + ☆ DreamSync: Aligning Text-to-Image Generation with Image Understanding + Feedback + + +
+ Despite their widespread success, Text-to-Image models (T2I) still struggle
+to produce images that are both aesthetically pleasing and faithful to the
+user's input text. We introduce DreamSync, a model-agnostic training algorithm
+that, by design, improves the faithfulness of T2I models to the input text.
+DreamSync builds off a recent insight from TIFA's evaluation framework -- that
+large vision-language models (VLMs) can effectively identify the fine-grained
+discrepancies between generated images and the text inputs. DreamSync uses
+this insight to train T2I models without any labeled data; it improves T2I
+models using their own generations. First, it prompts the model to generate
+several candidate images for a given input text. Then, it uses two VLMs to
+select the best generation: a Visual Question Answering model that measures
+the alignment of generated images to the text, and another that measures the
+generation's aesthetic quality. After selection, we use LoRA to iteratively
+finetune the T2I model to guide its generation towards the selected best
+generations. DreamSync does not need any additional human annotation, model
+architecture changes, or reinforcement learning. Despite its simplicity,
+DreamSync improves both the semantic alignment and aesthetic appeal of two
+diffusion-based T2I models, evidenced by multiple benchmarks (+1.7% on TIFA,
++2.9% on DSG1K, +3.4% on VILA aesthetic) and human evaluation.
+
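+ A hedged sketch of the generate-then-select step described above; the
+generator and the two VLM scorers are placeholder callables, not DreamSync's
+actual components:
+
+def pick_best_generation(prompt, generate_images, faithfulness_score,
+                         aesthetic_score, n_candidates=8):
+    """Generate candidates and keep the one ranked best by the two scorers."""
+    images = generate_images(prompt, n_candidates)
+    # Rank by text faithfulness first, breaking ties by aesthetic quality.
+    return max(images, key=lambda im: (faithfulness_score(prompt, im),
+                                       aesthetic_score(im)))
+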
+
+
+
+
+ + ♻ ☆ Chameleon: a heterogeneous and disaggregated accelerator system for + retrieval-augmented language models + + +
+ A Retrieval-Augmented Language Model (RALM) augments a generative language
+model by retrieving context-specific knowledge from an external database. This
+strategy facilitates impressive text generation quality even with smaller
+models, thus reducing computational demands by orders of magnitude. However,
+RALMs introduce unique system design challenges due to (a) the diverse
+workload characteristics between LM inference and retrieval and (b) the
+various system requirements and bottlenecks for different RALM configurations
+such as model sizes, database sizes, and retrieval frequencies. We propose
+Chameleon, a heterogeneous accelerator system that integrates both LM and
+retrieval accelerators in a disaggregated architecture. The heterogeneity
+ensures efficient acceleration of both LM inference and retrieval, while the
+accelerator disaggregation enables the system to independently scale both
+types of accelerators to fulfill diverse RALM requirements. Our Chameleon
+prototype implements retrieval accelerators on FPGAs and assigns LM inference
+to GPUs, with a CPU server orchestrating these accelerators over the network.
+Compared to CPU-based and CPU-GPU vector search systems, Chameleon achieves up
+to 23.72x speedup and 26.2x energy efficiency. Evaluated on various RALMs,
+Chameleon exhibits up to 2.16x reduction in latency and 3.18x speedup in
+throughput compared to the hybrid CPU-GPU architecture. These promising
+results pave the way for bringing accelerator heterogeneity and disaggregation
+into future RALM systems.
+
+
+
+
+
+ + ♻ ☆ LM-Cocktail: Resilient Tuning of Language Models via Model Merging + + +
+ Pre-trained language models are continually fine-tuned to better support
+downstream applications. However, this operation may result in significant
+performance degeneration on general tasks beyond the targeted domain. To
+overcome this problem, we propose LM-Cocktail, which enables the fine-tuned
+model to stay resilient in general perspectives. Our method is conducted in
+the form of model merging, where the fine-tuned language model is merged with
+the pre-trained base model or the peer models from other domains through a
+weighted average. Despite its simplicity, LM-Cocktail is surprisingly
+effective: the resulting model is able to achieve a strong empirical
+performance across the whole scope of general tasks while preserving a
+superior capacity in its targeted domain. We conduct comprehensive experiments
+with the LLaMA and BGE models on popular benchmarks, including FLAN, MMLU, and
+MTEB, and the results validate the efficacy of our proposed method. The code
+and checkpoints are available at
+https://github.com/FlagOpen/FlagEmbedding/tree/master/LM_Cocktail.
+
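+ The weighted-average merging described above can be illustrated with a short
+sketch (assuming the models share identical state-dict keys and shapes; this
+is a simplified illustration, not the LM-Cocktail implementation):
+
+import torch
+
+def merge_state_dicts(state_dicts, weights):
+    """Merge parameter dictionaries as a weighted average; weights should sum to 1."""
+    merged = {}
+    for key in state_dicts[0]:
+        merged[key] = sum(w * sd[key].float() for sd, w in zip(state_dicts, weights))
+    return merged
+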
+
+
+
+
+ + ♻ ☆ ChatGPT's One-year Anniversary: Are Open-Source Large Language Models + Catching up? + + +
+ Upon its release in late 2022, ChatGPT brought a seismic shift to the entire
+landscape of AI, both in research and commerce. Through instruction-tuning a
+large language model (LLM) with supervised fine-tuning and reinforcement
+learning from human feedback, it showed that a model could answer human
+questions and follow instructions on a broad panel of tasks. Following this
+success, interest in LLMs has intensified, with new LLMs flourishing at
+frequent intervals across academia and industry, including many start-ups
+focused on LLMs. While closed-source LLMs (e.g., OpenAI's GPT, Anthropic's
+Claude) generally outperform their open-source counterparts, progress on the
+latter has been rapid, with claims of achieving parity or even better
+performance on certain tasks. This has crucial implications not only for
+research but also for business. In this work, on the first anniversary of
+ChatGPT, we provide an exhaustive overview of this success, surveying all
+tasks where an open-source LLM has claimed to be on par with or better than
+ChatGPT.
+
+
+ comment: version v2, applied several minor changes +
+
+
+
+
+ + ♻ ☆ Efficient In-Context Learning in Vision-Language Models for Egocentric + Videos + + +
+ Recent advancements in text-only large language models (LLMs) have +highlighted the benefit of in-context learning for adapting to new tasks with a +few demonstrations. However, extending in-context learning to large +vision-language models (VLMs) using a huge amount of naturalistic +vision-language data has shown limited success, particularly for egocentric +videos, due to high data collection costs. We propose a novel training method +$\mathbb{E}$fficient $\mathbb{I}$n-context $\mathbb{L}$earning on +$\mathbb{E}$gocentric $\mathbb{V}$ideos ($\mathbb{EILEV}$), which elicits +in-context learning in VLMs for egocentric videos without requiring massive, +naturalistic egocentric video datasets. $\mathbb{EILEV}$ involves architectural +and training data adaptations to allow the model to process contexts +interleaved with video clips and narrations, sampling of in-context examples +with clusters of similar verbs and nouns, use of data with skewed marginal +distributions with a long tail of infrequent verbs and nouns, as well as +homonyms and synonyms. Our evaluations show that $\mathbb{EILEV}$-trained +models outperform larger VLMs trained on a huge amount of naturalistic data in +in-context learning. Furthermore, they can generalize to not only +out-of-distribution, but also novel, rare egocentric videos and texts via +in-context learning, demonstrating potential for applications requiring +cost-effective training, and rapid post-deployment adaptability. Our code and +demo are available at \url{https://github.com/yukw777/EILEV}. + +
+
+ comment: 10 pages, LaTeX; added acknowledgments +
+
+
+
+
+ + ♻ ☆ An Attribution Method for Siamese Encoders EMNLP'23 + + +
+ Despite the success of Siamese encoder models such as sentence transformers +(ST), little is known about the aspects of inputs they pay attention to. A +barrier is that their predictions cannot be attributed to individual features, +as they compare two inputs rather than processing a single one. This paper +derives a local attribution method for Siamese encoders by generalizing the +principle of integrated gradients to models with multiple inputs. The solution +takes the form of feature-pair attributions, and can be reduced to a +token-token matrix for STs. Our method involves the introduction of integrated +Jacobians and inherits the advantageous formal properties of integrated +gradients: it accounts for the model's full computation graph and is guaranteed +to converge to the actual prediction. A pilot study shows that in an ST few +token-pairs can often explain large fractions of predictions, and it focuses on +nouns and verbs. For accurate predictions, it however needs to attend to the +majority of tokens and parts of speech. + +
+
+ comment: Accepted to EMNLP'23 +
+
+
+
+
+ + ♻ ☆ Navigating Cultural Chasms: Exploring and Unlocking the Cultural POV of + Text-To-Image Models + + +
+ Text-To-Image (TTI) models, such as DALL-E and StableDiffusion, have +demonstrated remarkable prompt-based image generation capabilities. +Multilingual encoders may have a substantial impact on the cultural agency of +these models, as language is a conduit of culture. In this study, we explore +the cultural perception embedded in TTI models by characterizing culture across +three hierarchical tiers: cultural dimensions, cultural domains, and cultural +concepts. Based on this ontology, we derive prompt templates to unlock the +cultural knowledge in TTI models, and propose a comprehensive suite of +evaluation techniques, including intrinsic evaluations using the CLIP space, +extrinsic evaluations with a Visual-Question-Answer (VQA) model and human +assessments, to evaluate the cultural content of TTI-generated images. To +bolster our research, we introduce the CulText2I dataset, derived from four +diverse TTI models and spanning ten languages. Our experiments provide insights +regarding Do, What, Which and How research questions about the nature of +cultural encoding in TTI models, paving the way for cross-cultural applications +of these models. + +
+
+
+
+
+ + ♻ ☆ Loose lips sink ships: Mitigating Length Bias in Reinforcement Learning + from Human Feedback EMNLP 2023 + + +
+ Reinforcement learning from human feedback serves as a crucial bridge, +aligning large language models with human and societal values. This alignment +requires a vast corpus of human feedback to learn a reward model, which is +subsequently used to finetune language models. However, we have identified that +the reward model often finds shortcuts to bypass its intended objectives, +misleadingly assuming that humans prefer longer responses. The emergence of +length bias often induces the model to favor longer outputs, yet it doesn't +equate to an increase in helpful information within these outputs. In this +paper, we propose an innovative solution, applying the Product-of-Experts (PoE) +technique to separate reward modeling from the influence of sequence length. In +our framework, the main expert concentrates on understanding human intents, +while the biased expert targets the identification and capture of length bias. +To further enhance the learning of bias, we introduce perturbations into the +bias-focused expert, disrupting the flow of semantic information. Experimental +results validate the effectiveness of our approach, indicating that language +model performance is improved, irrespective of sequence length. + +
+
+ comment: EMNLP 2023 findings, Length Bias in RLHF, Mitigate bias in reward + modeling +
+
+
+
+
+ + ♻ ☆ Adapting Sentence Transformers for the Aviation Domain + + +
+ Learning effective sentence representations is crucial for many Natural +Language Processing (NLP) tasks, including semantic search, semantic textual +similarity (STS), and clustering. While multiple transformer models have been +developed for sentence embedding learning, these models may not perform +optimally when dealing with specialized domains like aviation, which has unique +characteristics such as technical jargon, abbreviations, and unconventional +grammar. Furthermore, the absence of labeled datasets makes it difficult to +train models specifically for the aviation domain. To address these challenges, +we propose a novel approach for adapting sentence transformers for the aviation +domain. Our method is a two-stage process consisting of pre-training followed +by fine-tuning. During pre-training, we use Transformers and Sequential +Denoising AutoEncoder (TSDAE) with aviation text data as input to improve the +initial model performance. Subsequently, we fine-tune our models using a +Natural Language Inference (NLI) dataset in the Sentence Bidirectional Encoder +Representations from Transformers (SBERT) architecture to mitigate overfitting +issues. Experimental results on several downstream tasks show that our adapted +sentence transformers significantly outperform general-purpose transformers, +demonstrating the effectiveness of our approach in capturing the nuances of the +aviation domain. Overall, our work highlights the importance of domain-specific +adaptation in developing high-quality NLP solutions for specialized industries +like aviation. + +
+
+
+
+
+ + ♻ ☆ Exploring Human-Like Translation Strategy with Large Language Models ACL + + +
+ Large language models (LLMs) have demonstrated impressive capabilities in +general scenarios, exhibiting a level of aptitude that approaches, in some +aspects even surpasses, human-level intelligence. Among their numerous skills, +the translation abilities of LLMs have received considerable attention. +Compared to typical machine translation that focuses solely on source-to-target +mapping, LLM-based translation can potentially mimic the human translation +process which might take preparatory steps to ensure high-quality translation. +This work explores this possibility by proposing the MAPS framework, which +stands for Multi-Aspect Prompting and Selection. Specifically, we enable LLMs +first to analyze the given source sentence and induce three aspects of +translation-related knowledge: keywords, topics, and relevant demonstrations to +guide the final translation process. Moreover, we employ a selection mechanism +based on quality estimation to filter out noisy and unhelpful knowledge. Both +automatic (3 LLMs x 11 directions x 2 automatic metrics) and human evaluation +(preference study and MQM) demonstrate the effectiveness of MAPS. Further +analysis shows that by mimicking the human translation process, MAPS reduces +various translation errors such as hallucination, ambiguity, mistranslation, +awkward style, untranslated text, and omission. Source code is available at +https://github.com/zwhe99/MAPS-mt. + +
+
+ comment: To be published in TACL (pre-MIT Press publication version) +
+
+
+
+
+ + ♻ ☆ Arabic Sentiment Analysis with Noisy Deep Explainable Model + + +
+ Sentiment Analysis (SA) is an indispensable task for many real-world
+applications. Compared to low-resource languages (e.g., Arabic, Bengali), most
+SA research has been conducted on high-resource languages (e.g., English,
+Chinese). Moreover, the reasons behind the predictions of Arabic sentiment
+analysis methods that exploit advanced artificial intelligence (AI)-based
+approaches are black-box-like and quite difficult to understand. This paper
+proposes an explainable sentiment classification framework for the Arabic
+language by introducing a noise layer on Bi-Directional Long Short-Term Memory
+(BiLSTM) and Convolutional Neural Network (CNN)-BiLSTM models that overcomes
+the over-fitting problem. The proposed framework can explain specific
+predictions by training a local surrogate explainable model to understand why
+a particular sentiment (positive or negative) is being predicted. We carried
+out experiments on public benchmark Arabic SA datasets. The results show that
+adding noise layers improves sentiment analysis performance for the Arabic
+language by reducing overfitting, and that our method outperforms some known
+state-of-the-art methods. In addition, the introduced explainability with the
+noise layer could make the model more transparent and accountable and hence
+help in adopting AI-enabled systems in practice.
+
+
+ comment: This is the pre-print version of our accepted paper at the 7th
+ International Conference on Natural Language Processing and Information
+ Retrieval (ACM NLPIR'2023)
+
+
+
+
+
+ + ♻ ☆ MuLER: Detailed and Scalable Reference-based Evaluation + + +
+ We propose a novel methodology (namely, MuLER) that transforms any
+reference-based evaluation metric for text generation, such as those used for
+machine translation (MT), into a fine-grained analysis tool. Given a system
+and a metric, MuLER quantifies how much the chosen metric penalizes specific
+error types (e.g., errors in translating names of locations). MuLER thus
+enables a detailed error analysis which can lead to targeted improvement
+efforts for specific phenomena. We perform experiments in both synthetic and
+naturalistic settings to support MuLER's validity and showcase its usability
+in MT evaluation and other tasks, such as summarization. Analyzing all
+submissions to WMT in 2014-2020, we find consistent trends. For example, nouns
+and verbs are among the most frequent POS tags, yet they are among the hardest
+to translate. Performance on most POS tags improves with overall system
+performance, but a few do not follow this correlation (and their identity
+changes from language to language). Preliminary experiments with summarization
+reveal similar trends.
+
+
+
+
+
+ + ♻ ☆ Rethinking Attention: Exploring Shallow Feed-Forward Neural Networks as + an Alternative to Attention Layers in Transformers AAAI24 + + +
+ This work presents an analysis of the effectiveness of using standard shallow +feed-forward networks to mimic the behavior of the attention mechanism in the +original Transformer model, a state-of-the-art architecture for +sequence-to-sequence tasks. We substitute key elements of the attention +mechanism in the Transformer with simple feed-forward networks, trained using +the original components via knowledge distillation. Our experiments, conducted +on the IWSLT2017 dataset, reveal the capacity of these "attentionless +Transformers" to rival the performance of the original architecture. Through +rigorous ablation studies, and experimenting with various replacement network +types and sizes, we offer insights that support the viability of our approach. +This not only sheds light on the adaptability of shallow feed-forward networks +in emulating attention mechanisms but also underscores their potential to +streamline complex architectures for sequence-to-sequence tasks. + +
+
+ comment: Accepted at AAAI24(https://aaai.org/aaai-conference/) +
+
+
+
+
+ + ♻ ☆ SentMix-3L: A Bangla-English-Hindi Code-Mixed Dataset for Sentiment + Analysis + + +
+ Code-mixing is a well-studied linguistic phenomenon in which two or more
+languages are mixed in text or speech. Several datasets have been built with
+the goal of training computational models for code-mixing. Although it is very
+common to observe code-mixing with multiple languages, most available datasets
+contain code-mixing between only two languages. In this paper, we introduce
+SentMix-3L, a novel dataset for sentiment analysis containing code-mixed data
+among three languages: Bangla, English, and Hindi. We carry out a
+comprehensive evaluation using SentMix-3L. We show that zero-shot prompting
+with GPT-3.5 outperforms all transformer-based models on SentMix-3L.
+
+
+
+
+
+ + ♻ ☆ Diffusion Glancing Transformer for Parallel Sequence to Sequence + Learning + + +
+ Previously, non-autoregressive models were widely perceived as being superior +in generation efficiency but inferior in generation quality due to the +difficulties of modeling multiple target modalities. To enhance the +multi-modality modeling ability, we propose the diffusion glancing transformer, +which employs a modality diffusion process and residual glancing sampling. The +modality diffusion process is a discrete process that interpolates the +multi-modal distribution along the decoding steps, and the residual glancing +sampling approach guides the model to continuously learn the remaining +modalities across the layers. Experimental results on various machine +translation and text generation benchmarks demonstrate that DIFFGLAT achieves +better generation accuracy while maintaining fast decoding speed compared with +both autoregressive and non-autoregressive models. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ SUR-adapter: Enhancing Text-to-Image Pre-trained Diffusion Models with + Large Language Models ACM MM 2023 + + +
+ Diffusion models, which have emerged to become popular text-to-image +generation models, can produce high-quality and content-rich images guided by +textual prompts. However, there are limitations to semantic understanding and +commonsense reasoning in existing models when the input prompts are concise +narrative, resulting in low-quality image generation. To improve the capacities +for narrative prompts, we propose a simple-yet-effective parameter-efficient +fine-tuning approach called the Semantic Understanding and Reasoning adapter +(SUR-adapter) for pre-trained diffusion models. To reach this goal, we first +collect and annotate a new dataset SURD which consists of more than 57,000 +semantically corrected multi-modal samples. Each sample contains a simple +narrative prompt, a complex keyword-based prompt, and a high-quality image. +Then, we align the semantic representation of narrative prompts to the complex +prompts and transfer knowledge of large language models (LLMs) to our +SUR-adapter via knowledge distillation so that it can acquire the powerful +semantic understanding and reasoning capabilities to build a high-quality +textual semantic representation for text-to-image generation. We conduct +experiments by integrating multiple LLMs and popular pre-trained diffusion +models to show the effectiveness of our approach in enabling diffusion models +to understand and reason concise natural language without image quality +degradation. Our approach can make text-to-image diffusion models easier to use +with better user experience, which demonstrates our approach has the potential +for further advancing the development of user-friendly text-to-image generation +models by bridging the semantic gap between simple narrative prompts and +complex keyword-based prompts. The code is released at +https://github.com/Qrange-group/SUR-adapter. + +
+
+ comment: accepted by ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Multi-turn Response Selection using Dialogue Dependency Relations EMNLP2020 + + +
+ Multi-turn response selection is a task designed for developing dialogue +agents. The performance on this task has a remarkable improvement with +pre-trained language models. However, these models simply concatenate the turns +in dialogue history as the input and largely ignore the dependencies between +the turns. In this paper, we propose a dialogue extraction algorithm to +transform a dialogue history into threads based on their dependency relations. +Each thread can be regarded as a self-contained sub-dialogue. We also propose +Thread-Encoder model to encode threads and candidates into compact +representations by pre-trained Transformers and finally get the matching score +through an attention layer. The experiments show that dependency relations are +helpful for dialogue context understanding, and our model outperforms the +state-of-the-art baselines on both DSTC7 and DSTC8*, with competitive results +on UbuntuV2. + +
+
+ comment: Accepted for publication as a long paper in EMNLP2020 +
+
+
+
+
+ + ♻ ☆ DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via + Multi-Modal Causal Attention + + +
+ Most of the existing multi-modal models, hindered by their incapacity to +adeptly manage interleaved image-and-text inputs in multi-image, multi-round +dialogues, face substantial constraints in resource allocation for training and +data accessibility, impacting their adaptability and scalability across varied +interaction realms. To address this, we present the DeepSpeed-VisualChat +framework, designed to optimize Large Language Models (LLMs) by incorporating +multi-modal capabilities, with a focus on enhancing the proficiency of Large +Vision and Language Models in handling interleaved inputs. Our framework is +notable for (1) its open-source support for multi-round and multi-image +dialogues, (2) introducing an innovative multi-modal causal attention +mechanism, and (3) utilizing data blending techniques on existing datasets to +assure seamless interactions in multi-round, multi-image conversations. +Compared to existing frameworks, DeepSpeed-VisualChat shows superior +scalability up to 70B parameter language model size, representing a significant +advancement in multi-modal language models and setting a solid foundation for +future explorations. + +
+
+
+
+
+ + ♻ ☆ Exo2EgoDVC: Dense Video Captioning of Egocentric Procedural Activities + Using Web Instructional Videos + + +
+ We propose a novel benchmark for cross-view knowledge transfer of dense video +captioning, adapting models from web instructional videos with exocentric views +to an egocentric view. While dense video captioning (predicting time segments +and their captions) is primarily studied with exocentric videos (e.g., +YouCook2), benchmarks with egocentric videos are restricted due to data +scarcity. To overcome the limited video availability, transferring knowledge +from abundant exocentric web videos is demanded as a practical approach. +However, learning the correspondence between exocentric and egocentric views is +difficult due to their dynamic view changes. The web videos contain mixed views +focusing on either human body actions or close-up hand-object interactions, +while the egocentric view is constantly shifting as the camera wearer moves. +This necessitates the in-depth study of cross-view transfer under complex view +changes. In this work, we first create a real-life egocentric dataset (EgoYC2) +whose captions are shared with YouCook2, enabling transfer learning between +these datasets assuming their ground-truth is accessible. To bridge the view +gaps, we propose a view-invariant learning method using adversarial training in +both the pre-training and fine-tuning stages. While the pre-training is +designed to learn invariant features against the mixed views in the web videos, +the view-invariant fine-tuning further mitigates the view gaps between both +datasets. We validate our proposed method by studying how effectively it +overcomes the view change problem and efficiently transfers the knowledge to +the egocentric domain. Our benchmark pushes the study of the cross-view +transfer into a new task domain of dense video captioning and will envision +methodologies to describe egocentric videos in natural language. + +
+
+
+
+
+ + ♻ ☆ Meaning Representations from Trajectories in Autoregressive Models + + +
+ We propose to extract meaning representations from autoregressive language +models by considering the distribution of all possible trajectories extending +an input text. This strategy is prompt-free, does not require fine-tuning, and +is applicable to any pre-trained autoregressive model. Moreover, unlike +vector-based representations, distribution-based representations can also model +asymmetric relations (e.g., direction of logical entailment, hypernym/hyponym +relations) by using algebraic operations between likelihood functions. These +ideas are grounded in distributional perspectives on semantics and are +connected to standard constructions in automata theory, but to our knowledge +they have not been applied to modern language models. We empirically show that +the representations obtained from large models align well with human +annotations, outperform other zero-shot and prompt-free methods on semantic +similarity tasks, and can be used to solve more complex entailment and +containment tasks that standard embeddings cannot handle. Finally, we extend +our method to represent data from different modalities (e.g., image and text) +using multimodal autoregressive models. Our code is available at: +https://github.com/tianyu139/meaning-as-trajectories + +
+
+
+
+
+ + ♻ ☆ ChatTraffic: Text-to-Traffic Generation via Diffusion Model + + +
+ Traffic prediction is one of the most significant foundations in Intelligent
+Transportation Systems (ITS). Traditional traffic prediction methods rely only
+on historical traffic data to predict traffic trends and face two main
+challenges: 1) insensitivity to unusual events, and 2) poor performance in
+long-term prediction. In this work, we explore how generative models combined
+with text describing the traffic system can be applied to traffic generation,
+and we name the task Text-to-Traffic Generation (TTG). The key challenge of
+the TTG task is how to associate text with the spatial structure of the road
+network and traffic data for generating traffic situations. To this end, we
+propose ChatTraffic, the first diffusion model for text-to-traffic generation.
+To guarantee the consistency between synthetic and real data, we augment a
+diffusion model with the Graph Convolutional Network (GCN) to extract spatial
+correlations of traffic data. In addition, we construct a large dataset
+containing text-traffic pairs for the TTG task. We benchmarked our model
+qualitatively and quantitatively on the released dataset. The experimental
+results indicate that ChatTraffic can generate realistic traffic situations
+from the text. Our code and dataset are available at
+https://github.com/ChyaZhang/ChatTraffic.
+
+
+
+
+
+ + ♻ ☆ KL-Divergence Guided Temperature Sampling + + +
+ Temperature sampling is a conventional approach to diversify large language
+model predictions. As temperature increases, the prediction becomes diverse but
+also vulnerable to hallucinations -- generating tokens that are sensible but
+not factual. One common approach to mitigate hallucinations is to provide
+source/grounding documents and to train the model to produce predictions that
+bind to and are attributable to the provided source. There thus appears to be a
+trade-off between diversity and attribution. To mitigate any such trade-off, we
+propose relaxing the constraint of a fixed temperature over decoding steps,
+together with a mechanism that guides the dynamic temperature according to the
+prediction's relevance to the source, measured through KL-divergence. Our
+experiments confirm the trade-off and show that our sampling algorithm
+outperforms the conventional top-k and top-p algorithms in conversational
+question-answering and summarization tasks.
+
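+ As a loose illustration of the dynamic-temperature idea, the sketch below maps
+a per-step KL divergence between source-conditioned and source-free next-token
+distributions to a sampling temperature; the mapping function, bounds, and scale
+are assumptions of this sketch rather than the paper's formula:
+
+     import torch
+     import torch.nn.functional as F
+
+     def dynamic_temperature(logits_with_source, logits_without_source,
+                             t_min=0.3, t_max=1.2, scale=1.0):
+         # KL(p_source || p_no_source) serves as a proxy for how strongly the
+         # grounding document constrains the next token at this decoding step.
+         p = F.log_softmax(logits_with_source, dim=-1)
+         q = F.log_softmax(logits_without_source, dim=-1)
+         kl = torch.sum(p.exp() * (p - q), dim=-1)
+         # High relevance to the source -> low temperature (stay attributable);
+         # low relevance -> higher temperature (allow diversity).
+         weight = torch.exp(-scale * kl)
+         return t_min + (t_max - t_min) * weight
+
+     def sample_next_token(logits_with_source, logits_without_source):
+         t = dynamic_temperature(logits_with_source, logits_without_source)
+         probs = F.softmax(logits_with_source / t, dim=-1)
+         return torch.multinomial(probs, num_samples=1)
+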
+
+
+
+
+ + ♻ ☆ A Minimal Approach for Natural Language Action Space in Text-based Games + + +
+ Text-based games (TGs) are language-based interactive environments for
+reinforcement learning. While language models (LMs) and knowledge graphs (KGs)
+are commonly used for handling the large action space in TGs, it is unclear
+whether these techniques are necessary or overused. In this paper, we revisit
+the challenge of exploring the action space in TGs and propose
+$\epsilon$-admissible exploration, a minimal approach that utilizes admissible
+actions during the training phase. Additionally, we present a text-based
+actor-critic (TAC) agent that produces textual commands for the game solely
+from game observations, without requiring any KG or LM. Our method, on average
+across 10 games from Jericho, outperforms strong baselines and state-of-the-art
+agents that use LMs and KGs. Our approach highlights that a much lighter model
+design, with a fresh perspective on utilizing the information within the
+environments, suffices for effective exploration of exponentially large action
+spaces.
+
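+ A toy reading of $\epsilon$-admissible exploration (treat the exact schedule
+and scoring as assumptions of this sketch): with probability $\epsilon$ the
+agent samples a uniformly random admissible action during training, and
+otherwise acts greedily with respect to its own action scores:
+
+     import random
+
+     def epsilon_admissible_action(policy_scores, admissible_actions, epsilon=0.3):
+         # policy_scores: dict mapping action strings to the agent's scores.
+         # admissible_actions: list of action strings valid in the current state.
+         if random.random() < epsilon:
+             return random.choice(admissible_actions)   # explore
+         return max(admissible_actions,
+                    key=lambda a: policy_scores.get(a, float("-inf")))  # exploit
+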
+
+
+
+
+ + ♻ ☆ AART: AI-Assisted Red-Teaming with Diverse Data Generation for New + LLM-powered Applications + + +
+ Adversarial testing of large language models (LLMs) is crucial for their safe +and responsible deployment. We introduce a novel approach for automated +generation of adversarial evaluation datasets to test the safety of LLM +generations on new downstream applications. We call it AI-assisted Red-Teaming +(AART) - an automated alternative to current manual red-teaming efforts. AART +offers a data generation and augmentation pipeline of reusable and customizable +recipes that reduce human effort significantly and enable integration of +adversarial testing earlier in new product development. AART generates +evaluation datasets with high diversity of content characteristics critical for +effective adversarial testing (e.g. sensitive and harmful concepts, specific to +a wide range of cultural and geographic regions and application scenarios). The +data generation is steered by AI-assisted recipes to define, scope and +prioritize diversity within the application context. This feeds into a +structured LLM-generation process that scales up evaluation priorities. +Compared to some state-of-the-art tools, AART shows promising results in terms +of concept coverage and data quality. + +
+
+
+
+
+ + ♻ ☆ Do LLMs exhibit human-like response biases? A case study in survey + design + + +
+ As large language models (LLMs) become more capable, there is growing +excitement about the possibility of using LLMs as proxies for humans in +real-world tasks where subjective labels are desired, such as in surveys and +opinion polling. One widely-cited barrier to the adoption of LLMs is their +sensitivity to prompt wording - but interestingly, humans also display +sensitivities to instruction changes in the form of response biases. As such, +we argue that if LLMs are going to be used to approximate human opinions, it is +necessary to investigate the extent to which LLMs also reflect human response +biases, if at all. In this work, we use survey design as a case study, where +human response biases caused by permutations in wordings of "prompts" have been +extensively studied. Drawing from prior work in social psychology, we design a +dataset and propose a framework to evaluate whether LLMs exhibit human-like +response biases in survey questionnaires. Our comprehensive evaluation of nine +models shows that popular open and commercial LLMs generally fail to reflect +human-like behavior. These inconsistencies tend to be more prominent in models +that have been instruction fine-tuned. Furthermore, even if a model shows a +significant change in the same direction as humans, we find that perturbations +that are not meant to elicit significant changes in humans may also result in a +similar change. These results highlight the potential pitfalls of using LLMs to +substitute humans in parts of the annotation pipeline, and further underscore +the importance of finer-grained characterizations of model behavior. Our code, +dataset, and collected samples are available at +https://github.com/lindiatjuatja/BiasMonkey + +
+
+
+
+
+ + ♻ ☆ MagicBrush: A Manually Annotated Dataset for Instruction-Guided Image + Editing NeurIPS 2023 + + +
+ Text-guided image editing is widely needed in daily life, ranging from
+personal use to professional applications such as Photoshop. However, existing
+methods are either zero-shot or trained on an automatically synthesized
+dataset, which contains a high volume of noise. Thus, they still require lots
+of manual tuning to produce desirable outcomes in practice. To address this
+issue, we introduce MagicBrush (https://osu-nlp-group.github.io/MagicBrush/),
+the first large-scale, manually annotated dataset for instruction-guided real
+image editing that covers diverse scenarios: single-turn, multi-turn,
+mask-provided, and mask-free editing. MagicBrush comprises over 10K manually
+annotated triplets (source image, instruction, target image), which supports
+training large-scale text-guided image editing models. We fine-tune
+InstructPix2Pix on MagicBrush and show that the new model can produce much
+better images according to human evaluation. We further conduct extensive
+experiments to evaluate current image editing baselines from multiple
+dimensions including quantitative, qualitative, and human evaluations. The
+results reveal the challenging nature of our dataset and the gap between
+current baselines and real-world editing needs.
+
+
+ comment: NeurIPS 2023; Website: https://osu-nlp-group.github.io/MagicBrush/ +
+
+
+
+
+ + ♻ ☆ Self-Chained Image-Language Model for Video Localization and Question + Answering NeurIPS 2023 + + +
+ Recent studies have shown promising results on utilizing large pre-trained +image-language models for video question answering. While these image-language +models can efficiently bootstrap the representation learning of video-language +models, they typically concatenate uniformly sampled video frames as visual +inputs without explicit language-aware, temporal modeling. When only a portion +of a video input is relevant to the language query, such uniform frame sampling +can often lead to missing important visual cues. Although humans often find a +video moment to focus on and rewind the moment to answer questions, training a +query-aware video moment localizer often requires expensive annotations and +high computational costs. To address this issue, we propose Self-Chained Video +Localization-Answering (SeViLA), a novel framework that leverages a single +image-language model (BLIP-2) to tackle both temporal keyframe localization and +QA on videos. SeViLA framework consists of two modules: Localizer and Answerer, +where both are parameter-efficiently fine-tuned from BLIP-2. We propose two +ways of chaining these modules for cascaded inference and self-refinement. +First, in the forward chain, the Localizer finds multiple language-aware +keyframes in a video, which the Answerer uses to predict the answer. Second, in +the reverse chain, the Answerer generates keyframe pseudo-labels to refine the +Localizer, alleviating the need for expensive video moment localization +annotations. Our SeViLA framework outperforms several strong baselines on 5 +challenging video QA and event prediction benchmarks, and achieves the +state-of-the-art in both fine-tuning (NExT-QA, STAR) and zero-shot (NExT-QA, +STAR, How2QA, VLEP) settings. We also analyze the impact of Localizer, +comparisons of Localizer with other temporal localization models, +pre-training/self-refinement of Localizer, and varying the number of keyframes. + +
+
+ comment: NeurIPS 2023; Our code and checkpoints are available at: + https://github.com/Yui010206/SeViLA +
+
+
+
+
+ + ♻ ☆ The Falcon Series of Open Language Models + + +
+ We introduce the Falcon series: causal decoder-only models with 7B, 40B, and
+180B parameters, trained on diverse, high-quality corpora predominantly
+assembled from web data. The largest model, Falcon-180B, has been trained on
+over 3.5 trillion tokens of text -- the largest openly documented pretraining
+run. Falcon-180B significantly outperforms models such as PaLM or Chinchilla,
+and improves upon concurrently developed models such as LLaMA 2 or
+Inflection-1. It nears the performance of PaLM-2-Large at a reduced pretraining
+and inference cost, making it, to our knowledge, one of the three best language
+models in the world along with GPT-4 and PaLM-2-Large. We report detailed
+evaluations, as well as a deep dive into the methods and custom tooling
+employed to pretrain Falcon. Notably, we report on our custom distributed
+training codebase, allowing us to efficiently pretrain these models on up to
+4,096 A100s on AWS cloud infrastructure with limited interconnect. We release a
+600B-token extract of our web dataset, as well as the Falcon-7/40/180B models
+under a permissive license, to foster open science and accelerate the
+development of an open ecosystem of large language models.
+
+
+
+
+
+ + ♻ ☆ LLMs for Science: Usage for Code Generation and Data Analysis + + +
+ Large language models (LLMs) have been touted to enable increased +productivity in many areas of today's work life. Scientific research as an area +of work is no exception: the potential of LLM-based tools to assist in the +daily work of scientists has become a highly discussed topic across +disciplines. However, we are only at the very onset of this subject of study. +It is still unclear how the potential of LLMs will materialise in research +practice. With this study, we give first empirical evidence on the use of LLMs +in the research process. We have investigated a set of use cases for LLM-based +tools in scientific research, and conducted a first study to assess to which +degree current tools are helpful. In this paper we report specifically on use +cases related to software engineering, such as generating application code and +developing scripts for data analytics. While we studied seemingly simple use +cases, results across tools differ significantly. Our results highlight the +promise of LLM-based tools in general, yet we also observe various issues, +particularly regarding the integrity of the output these tools provide. + +
+
+ comment: Preprint; In Submission +
+
+
+
+
+ + ♻ ☆ The impact of responding to patient messages with large language model + assistance + + +
+ Documentation burden is a major contributor to clinician burnout, which is
+rising nationally and is an urgent threat to our ability to care for patients.
+Artificial intelligence (AI) chatbots, such as ChatGPT, could reduce clinician
+burden by assisting with documentation. Although many hospitals are actively
+integrating such systems into electronic medical record systems, the utility of
+AI chatbots and their impact on clinical decision-making have not been studied
+for this intended use. We are the first to examine the utility of large
+language models in helping clinicians draft responses to patient questions. In
+our two-stage cross-sectional study, 6 oncologists responded to 100 realistic
+synthetic cancer patient scenarios and portal messages developed to reflect
+common medical situations, first manually, then with AI assistance.
+ We find that AI-assisted responses were longer and less readable, but provided
+acceptable drafts without edits 58% of the time. AI assistance improved
+efficiency 77% of the time, with a low risk of harm (82% safe). However, 7.7%
+of unedited AI responses could cause severe harm. In 31% of cases, physicians
+thought the AI drafts were human-written. AI assistance led to more patient
+education recommendations and fewer clinical actions than manual responses.
+These results show promise for AI to improve clinician efficiency and patient
+care through assisting with documentation, if used judiciously. Monitoring
+model outputs and human-AI interaction remains crucial for safe implementation.
+
+
+ comment: 4 figures and tables in main, submitted for review +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 150 + +
+
+
+ + ☆ Visual Anagrams: Generating Multi-View Optical Illusions with Diffusion + Models + + +
+ We address the problem of synthesizing multi-view optical illusions: images +that change appearance upon a transformation, such as a flip or rotation. We +propose a simple, zero-shot method for obtaining these illusions from +off-the-shelf text-to-image diffusion models. During the reverse diffusion +process, we estimate the noise from different views of a noisy image. We then +combine these noise estimates together and denoise the image. A theoretical +analysis suggests that this method works precisely for views that can be +written as orthogonal transformations, of which permutations are a subset. This +leads to the idea of a visual anagram--an image that changes appearance under +some rearrangement of pixels. This includes rotations and flips, but also more +exotic pixel permutations such as a jigsaw rearrangement. Our approach also +naturally extends to illusions with more than two views. We provide both +qualitative and quantitative results demonstrating the effectiveness and +flexibility of our method. Please see our project webpage for additional +visualizations and results: https://dangeng.github.io/visual_anagrams/ + +
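+ The noise-combination step described above can be sketched roughly as below;
+the view classes, the diffusers-style unet(sample, t,
+encoder_hidden_states=...).sample call, and simple averaging are assumptions of
+this sketch:
+
+     import torch
+
+     class RotateView:
+         # A 180-degree rotation view; its inverse is itself.
+         def apply(self, x):  return torch.rot90(x, k=2, dims=(-2, -1))
+         def invert(self, x): return torch.rot90(x, k=-2, dims=(-2, -1))
+
+     class IdentityView:
+         def apply(self, x):  return x
+         def invert(self, x): return x
+
+     def multi_view_noise_estimate(unet, x_t, t, prompt_embeds, views):
+         # For each view: transform the noisy image, predict noise conditioned on
+         # that view's prompt, then map the estimate back to the canonical
+         # orientation before averaging the aligned estimates.
+         estimates = []
+         for view, embed in zip(views, prompt_embeds):
+             eps = unet(view.apply(x_t), t, encoder_hidden_states=embed).sample
+             estimates.append(view.invert(eps))
+         return torch.stack(estimates).mean(dim=0)
+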
+
+
+
+
+ + ☆ Do text-free diffusion models learn discriminative visual + representations? + + +
+ While many unsupervised learning models focus on one family of tasks, either +generative or discriminative, we explore the possibility of a unified +representation learner: a model which addresses both families of tasks +simultaneously. We identify diffusion models, a state-of-the-art method for +generative tasks, as a prime candidate. Such models involve training a U-Net to +iteratively predict and remove noise, and the resulting model can synthesize +high-fidelity, diverse, novel images. We find that the intermediate feature +maps of the U-Net are diverse, discriminative feature representations. We +propose a novel attention mechanism for pooling feature maps and further +leverage this mechanism as DifFormer, a transformer feature fusion of features +from different diffusion U-Net blocks and noise steps. We also develop DifFeed, +a novel feedback mechanism tailored to diffusion. We find that diffusion models +are better than GANs, and, with our fusion and feedback mechanisms, can compete +with state-of-the-art unsupervised image representation learning methods for +discriminative tasks - image classification with full and semi-supervision, +transfer for fine-grained classification, object detection and segmentation, +and semantic segmentation. Our project website +(https://mgwillia.github.io/diffssl/) and code +(https://github.com/soumik-kanad/diffssl) are available publicly. + +
+
+ comment: Website: see https://mgwillia.github.io/diffssl/ . Code: see + https://github.com/soumik-kanad/diffssl . The first two authors contributed + equally. 15 pages, 9 figures, 15 tables. Submission under review. arXiv admin + note: text overlap with arXiv:2307.08702 +
+
+
+
+
+ + ☆ A Simple Recipe for Language-guided Domain Generalized Segmentation + + +
+ Generalization to new domains not seen during training is one of the +long-standing goals and challenges in deploying neural networks in real-world +applications. Existing generalization techniques necessitate substantial data +augmentation, potentially sourced from external datasets, and aim at learning +invariant representations by imposing various alignment constraints. +Large-scale pretraining has recently shown promising generalization +capabilities, along with the potential of bridging different modalities. For +instance, the recent advent of vision-language models like CLIP has opened the +doorway for vision models to exploit the textual modality. In this paper, we +introduce a simple framework for generalizing semantic segmentation networks by +employing language as the source of randomization. Our recipe comprises three +key ingredients: i) the preservation of the intrinsic CLIP robustness through +minimal fine-tuning, ii) language-driven local style augmentation, and iii) +randomization by locally mixing the source and augmented styles during +training. Extensive experiments report state-of-the-art results on various +generalization benchmarks. The code will be made available. + +
+
+ comment: Project page: https://astra-vision.github.io/FAMix +
+
+
+
+
+ + ☆ Driving into the Future: Multiview Visual Forecasting and Planning with + World Model for Autonomous Driving + + +
+ In autonomous driving, predicting future events in advance and evaluating the +foreseeable risks empowers autonomous vehicles to better plan their actions, +enhancing safety and efficiency on the road. To this end, we propose Drive-WM, +the first driving world model compatible with existing end-to-end planning +models. Through a joint spatial-temporal modeling facilitated by view +factorization, our model generates high-fidelity multiview videos in driving +scenes. Building on its powerful generation ability, we showcase the potential +of applying the world model for safe driving planning for the first time. +Particularly, our Drive-WM enables driving into multiple futures based on +distinct driving maneuvers, and determines the optimal trajectory according to +the image-based rewards. Evaluation on real-world driving datasets verifies +that our method could generate high-quality, consistent, and controllable +multiview videos, opening up possibilities for real-world simulations and safe +planning. + +
+
+ comment: Project page: https://drive-wm.github.io. Code: + https://github.com/BraveGroup/Drive-WM +
+
+
+
+
+ + ☆ AvatarStudio: High-fidelity and Animatable 3D Avatar Creation from Text + + +
+ We study the problem of creating high-fidelity and animatable 3D avatars from +only textual descriptions. Existing text-to-avatar methods are either limited +to static avatars which cannot be animated or struggle to generate animatable +avatars with promising quality and precise pose control. To address these +limitations, we propose AvatarStudio, a coarse-to-fine generative model that +generates explicit textured 3D meshes for animatable human avatars. +Specifically, AvatarStudio begins with a low-resolution NeRF-based +representation for coarse generation, followed by incorporating SMPL-guided +articulation into the explicit mesh representation to support avatar animation +and high resolution rendering. To ensure view consistency and pose +controllability of the resulting avatars, we introduce a 2D diffusion model +conditioned on DensePose for Score Distillation Sampling supervision. By +effectively leveraging the synergy between the articulated mesh representation +and the DensePose-conditional diffusion model, AvatarStudio can create +high-quality avatars from text that are ready for animation, significantly +outperforming previous methods. Moreover, it is competent for many +applications, e.g., multimodal avatar animations and style-guided avatar +creation. For more results, please refer to our project page: +http://jeff95.me/projects/avatarstudio.html + +
+
+ comment: Project page at http://jeff95.me/projects/avatarstudio.html +
+
+
+
+
+ + ☆ OPERA: Alleviating Hallucination in Multi-Modal Large Language Models + via Over-Trust Penalty and Retrospection-Allocation + + +
+ Hallucination, posed as a pervasive challenge of multi-modal large language
+models (MLLMs), has significantly impeded their real-world usage that demands
+precise judgment. Existing methods mitigate this issue either by training with
+specifically designed data or by inferencing with external knowledge from other
+sources, incurring inevitable additional costs. In this paper, we present
+OPERA, a novel MLLM decoding method grounded in an Over-trust Penalty and a
+Retrospection-Allocation strategy, serving as a nearly free lunch to alleviate
+the hallucination issue without additional data, knowledge, or training. Our
+approach begins with an interesting observation: most hallucinations are
+closely tied to the knowledge aggregation patterns manifested in the
+self-attention matrix, i.e., MLLMs tend to generate new tokens by focusing on a
+few summary tokens rather than all the previous tokens. This partial over-trust
+inclination causes the model to neglect image tokens and to describe the image
+content with hallucinations. Statistically, we observe an 80%$\sim$95%
+co-occurrence rate between hallucinated content and such knowledge aggregation
+patterns. Based on this observation, OPERA introduces a penalty term on the
+model logits during beam-search decoding to mitigate the over-trust issue,
+along with a rollback strategy that retrospects the presence of summary tokens
+among the previously generated tokens and re-allocates the token selection if
+necessary. With extensive experiments, OPERA shows significant
+hallucination-mitigating performance on different MLLMs and metrics, proving
+its effectiveness and generality. Our code is available at:
+https://github.com/shikiw/OPERA.
+
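+ A heavily simplified sketch of an over-trust style penalty during beam search
+(OPERA's actual scoring differs; the window layout and the column-wise product
+are assumptions of this sketch):
+
+     import torch
+
+     def over_trust_penalty(attn_window, alpha=1.0):
+         # attn_window: self-attention weights from recently generated tokens to
+         # their predecessors, shape [new_tokens, context]. A column-wise product
+         # grows when many new tokens all attend to the same "summary" token.
+         col_scores = attn_window.clamp_min(1e-8).log().sum(dim=0).exp()
+         return alpha * col_scores.max()
+
+     def rescored_beam(cumulative_logprob, attn_window):
+         # Subtract the penalty from the beam's cumulative log-probability so
+         # that beams exhibiting the aggregation pattern are ranked lower.
+         return cumulative_logprob - over_trust_penalty(attn_window)
+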
+
+ comment: technical report +
+
+
+
+
+ + ☆ HUGS: Human Gaussian Splats + + +
+ Recent advances in neural rendering have improved both training and rendering
+times by orders of magnitude. While these methods demonstrate state-of-the-art
+quality and speed, they are designed for photogrammetry of static scenes and do
+not generalize well to freely moving humans in the environment. In this work,
+we introduce Human Gaussian Splats (HUGS), which represents an animatable human
+together with the scene using 3D Gaussian Splatting (3DGS). Our method takes
+only a monocular video with a small number of (50-100) frames, and it
+automatically learns to disentangle the static scene and a fully animatable
+human avatar within 30 minutes. We utilize the SMPL body model to initialize
+the human Gaussians. To capture details that are not modeled by SMPL (e.g.,
+clothing and hair), we allow the 3D Gaussians to deviate from the human body
+model. Utilizing 3D Gaussians for animated humans brings new challenges,
+including the artifacts created when articulating the Gaussians. We propose to
+jointly optimize the linear blend skinning weights to coordinate the movements
+of individual Gaussians during animation. Our approach enables novel-pose
+synthesis of the human and novel-view synthesis of both the human and the
+scene. We achieve state-of-the-art rendering quality with a rendering speed of
+60 FPS while being ~100x faster to train than previous work. Our code will be
+announced here: https://github.com/apple/ml-hugs
+
+
+
+
+
+ + ☆ CG3D: Compositional Generation for Text-to-3D via Gaussian Splatting + + +
+ With the onset of diffusion-based generative models and their ability to
+generate text-conditioned images, content generation has received a massive
+invigoration. Recently, these models have been shown to provide useful guidance
+for the generation of 3D graphics assets. However, existing work in
+text-conditioned 3D generation faces fundamental constraints: (i) the inability
+to generate detailed, multi-object scenes, (ii) the inability to textually
+control multi-object configurations, and (iii) the lack of physically realistic
+scene composition. In this work, we propose CG3D, a method for compositionally
+generating scalable 3D assets that resolves these constraints. We find that
+explicit Gaussian radiance fields, parameterized to allow for compositions of
+objects, possess the capability to enable semantically and physically
+consistent scenes. By utilizing a guidance framework built around this explicit
+representation, we show state-of-the-art results, capable of even exceeding the
+guiding diffusion model in terms of object combinations and physics accuracy.
+
+
+
+
+
+ + ☆ Language-conditioned Detection Transformer + + +
+ We present a new open-vocabulary detection framework. Our framework uses both +image-level labels and detailed detection annotations when available. Our +framework proceeds in three steps. We first train a language-conditioned object +detector on fully-supervised detection data. This detector gets to see the +presence or absence of ground truth classes during training, and conditions +prediction on the set of present classes. We use this detector to pseudo-label +images with image-level labels. Our detector provides much more accurate +pseudo-labels than prior approaches with its conditioning mechanism. Finally, +we train an unconditioned open-vocabulary detector on the pseudo-annotated +images. The resulting detector, named DECOLA, shows strong zero-shot +performance in open-vocabulary LVIS benchmark as well as direct zero-shot +transfer benchmarks on LVIS, COCO, Object365, and OpenImages. DECOLA +outperforms the prior arts by 17.1 AP-rare and 9.4 mAP on zero-shot LVIS +benchmark. DECOLA achieves state-of-the-art results in various model sizes, +architectures, and datasets by only training on open-sourced data and +academic-scale computing. Code is available at +https://github.com/janghyuncho/DECOLA. + +
+
+ comment: Code is at https://github.com/janghyuncho/DECOLA +
+
+
+
+
+ + ☆ SODA: Bottleneck Diffusion Models for Representation Learning + + +
+ We introduce SODA, a self-supervised diffusion model, designed for +representation learning. The model incorporates an image encoder, which +distills a source view into a compact representation, that, in turn, guides the +generation of related novel views. We show that by imposing a tight bottleneck +between the encoder and a denoising decoder, and leveraging novel view +synthesis as a self-supervised objective, we can turn diffusion models into +strong representation learners, capable of capturing visual semantics in an +unsupervised manner. To the best of our knowledge, SODA is the first diffusion +model to succeed at ImageNet linear-probe classification, and, at the same +time, it accomplishes reconstruction, editing and synthesis tasks across a wide +range of datasets. Further investigation reveals the disentangled nature of its +emergent latent space, that serves as an effective interface to control and +manipulate the model's produced images. All in all, we aim to shed light on the +exciting and promising potential of diffusion models, not only for image +generation, but also for learning rich and robust representations. + +
+
+
+
+
+ + ☆ Knowledge Pursuit Prompting for Zero-Shot Multimodal Synthesis + + +
+ Hallucinations and unfaithful synthesis due to inaccurate prompts with +insufficient semantic details are widely observed in multimodal generative +models. A prevalent strategy to align multiple modalities is to fine-tune the +generator with a large number of annotated text-image pairs. However, such a +procedure is labor-consuming and resource-draining. The key question we ask is: +can we enhance the quality and faithfulness of text-driven generative models +beyond extensive text-image pair annotations? To address this question, we +propose Knowledge Pursuit Prompting (KPP), a zero-shot framework that +iteratively incorporates external knowledge to help generators produce reliable +visual content. Instead of training generators to handle generic prompts, KPP +employs a recursive knowledge query process to gather informative external +facts from the knowledge base, instructs a language model to compress the +acquired knowledge for prompt refinement, and utilizes text-driven generators +for visual synthesis. The entire process is zero-shot, without accessing the +architectures and parameters of generative models. We evaluate the framework +across multiple text-driven generative tasks (image, 3D rendering, and video) +on datasets of different domains. We further demonstrate the extensibility and +adaptability of KPP through varying foundation model bases and instructions. +Our results show that KPP is capable of generating faithful and semantically +rich content across diverse visual domains, offering a promising solution to +improve multimodal generative models. + +
+
+
+
+
+ + ☆ Betrayed by Attention: A Simple yet Effective Approach for + Self-supervised Video Object Segmentation + + +
+ In this paper, we propose a simple yet effective approach for self-supervised +video object segmentation (VOS). Our key insight is that the inherent +structural dependencies present in DINO-pretrained Transformers can be +leveraged to establish robust spatio-temporal correspondences in videos. +Furthermore, simple clustering on this correspondence cue is sufficient to +yield competitive segmentation results. Previous self-supervised VOS techniques +majorly resort to auxiliary modalities or utilize iterative slot attention to +assist in object discovery, which restricts their general applicability and +imposes higher computational requirements. To deal with these challenges, we +develop a simplified architecture that capitalizes on the emerging objectness +from DINO-pretrained Transformers, bypassing the need for additional modalities +or slot attention. Specifically, we first introduce a single spatio-temporal +Transformer block to process the frame-wise DINO features and establish +spatio-temporal dependencies in the form of self-attention. Subsequently, +utilizing these attention maps, we implement hierarchical clustering to +generate object segmentation masks. To train the spatio-temporal block in a +fully self-supervised manner, we employ semantic and dynamic motion consistency +coupled with entropy normalization. Our method demonstrates state-of-the-art +performance across multiple unsupervised VOS benchmarks and particularly excels +in complex real-world multi-object video segmentation tasks such as +DAVIS-17-Unsupervised and YouTube-VIS-19. The code and model checkpoints will +be released at https://github.com/shvdiwnkozbw/SSL-UVOS. + +
+
+
+
+
+ + ☆ Pose Anything: A Graph-Based Approach for Category-Agnostic Pose + Estimation + + +
+ Traditional 2D pose estimation models are limited by their category-specific +design, making them suitable only for predefined object categories. This +restriction becomes particularly challenging when dealing with novel objects +due to the lack of relevant training data. + To address this limitation, category-agnostic pose estimation (CAPE) was +introduced. CAPE aims to enable keypoint localization for arbitrary object +categories using a single model, requiring minimal support images with +annotated keypoints. This approach not only enables object pose generation +based on arbitrary keypoint definitions but also significantly reduces the +associated costs, paving the way for versatile and adaptable pose estimation +applications. + We present a novel approach to CAPE that leverages the inherent geometrical +relations between keypoints through a newly designed Graph Transformer Decoder. +By capturing and incorporating this crucial structural information, our method +enhances the accuracy of keypoint localization, marking a significant departure +from conventional CAPE techniques that treat keypoints as isolated entities. + We validate our approach on the MP-100 benchmark, a comprehensive dataset +comprising over 20,000 images spanning more than 100 categories. Our method +outperforms the prior state-of-the-art by substantial margins, achieving +remarkable improvements of 2.16% and 1.82% under 1-shot and 5-shot settings, +respectively. Furthermore, our method's end-to-end training demonstrates both +scalability and efficiency compared to previous CAPE approaches. + +
+
+
+
+
+ + ☆ TSDF-Sampling: Efficient Sampling for Neural Surface Field using + Truncated Signed Distance Field + + +
+ Multi-view neural surface reconstruction has exhibited impressive results.
+However, a notable limitation is the prohibitively slow inference time when
+compared to traditional techniques, primarily attributed to the dense sampling
+required to maintain the rendering quality. This paper introduces a novel
+approach that substantially reduces the number of samples by incorporating the
+Truncated Signed Distance Field (TSDF) of the scene. While prior works have
+proposed importance sampling, their dependence on initial uniform samples over
+the entire space makes them unable to avoid performance degradation when using
+a smaller number of samples. In contrast, our method leverages the TSDF volume
+generated only from the trained views, which proves to provide a reasonable
+bound on the sampling for upcoming novel views. As a result, we achieve high
+rendering quality by fully exploiting the continuous neural SDF estimation
+within the bounds given by the TSDF volume. Notably, our method is the first
+approach that can be robustly used in a plug-and-play manner with a diverse
+array of neural surface field models, as long as they use the volume rendering
+technique. Our empirical results show an 11-fold increase in inference speed
+without compromising performance. The result videos are available at our
+project page: https://tsdf-sampling.github.io/
+
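+ A rough sketch of how a TSDF volume could bound ray sampling for a neural SDF
+renderer; the grid layout, normalization to [-1, 1]^3, and the fallback to the
+full range are assumptions of this sketch:
+
+     import torch
+     import torch.nn.functional as F
+
+     def near_surface_bounds(tsdf, ray_o, ray_d, t_near, t_far,
+                             n_coarse=64, band=1.0):
+         # tsdf: [D, H, W] truncated SDF values on a grid covering [-1, 1]^3.
+         # Returns a tight [t_min, t_max] interval along the ray where |TSDF| is
+         # below `band`, so dense neural-SDF samples are spent only near surfaces.
+         t = torch.linspace(t_near, t_far, n_coarse)
+         pts = ray_o + t[:, None] * ray_d                   # [n_coarse, 3]
+         grid = pts.view(1, 1, 1, -1, 3)                    # layout for grid_sample
+         vals = F.grid_sample(tsdf[None, None], grid, align_corners=True).view(-1)
+         mask = vals.abs() < band
+         if not mask.any():
+             return t_near, t_far                           # fall back to full range
+         return t[mask].min().item(), t[mask].max().item()
+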
+
+
+
+
+ + ☆ Enhancing Post-Hoc Explanation Benchmark Reliability for Image + Classification + + +
+ Deep neural networks, while powerful for image classification, often operate
+as "black boxes," complicating the understanding of their decision-making
+processes. Various explanation methods, particularly those generating saliency
+maps, aim to address this challenge. However, the inconsistency of faithfulness
+metrics hinders reliable benchmarking of explanation methods. This paper
+employs an approach inspired by psychometrics, utilizing Krippendorff's alpha
+to quantify the benchmark reliability of post-hoc methods in image
+classification. The study proposes model training modifications, including
+feeding perturbed samples and employing focal loss, to enhance robustness and
+calibration. Empirical evaluations demonstrate significant improvements in
+benchmark reliability across metrics, datasets, and post-hoc methods. This
+pioneering work establishes a foundation for more reliable evaluation practices
+in the realm of post-hoc explanation methods, emphasizing the importance of
+model robustness in the assessment process.
+
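+ For concreteness, the reliability measure named above can be computed with the
+third-party `krippendorff` Python package; the toy ranking matrix below is
+illustrative only:
+
+     import numpy as np
+     import krippendorff  # pip install krippendorff
+
+     # Rows: faithfulness metrics acting as "raters"; columns: explanation methods.
+     # Each entry is the rank a metric assigns to a method (toy numbers).
+     rankings = np.array([
+         [1, 2, 3, 4],   # e.g., a deletion-style metric
+         [2, 1, 3, 4],   # e.g., an insertion-style metric
+         [1, 3, 2, 4],   # e.g., a pointing-game-style metric
+     ])
+     alpha = krippendorff.alpha(reliability_data=rankings,
+                                level_of_measurement="ordinal")
+     print(f"Benchmark reliability (Krippendorff's alpha): {alpha:.3f}")
+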
+
+
+
+
+ + ☆ FisherRF: Active View Selection and Uncertainty Quantification for + Radiance Fields using Fisher Information + + +
+ This study addresses the challenging problem of active view selection and +uncertainty quantification within the domain of Radiance Fields. Neural +Radiance Fields (NeRF) have greatly advanced image rendering and +reconstruction, but the limited availability of 2D images poses uncertainties +stemming from occlusions, depth ambiguities, and imaging errors. Efficiently +selecting informative views becomes crucial, and quantifying NeRF model +uncertainty presents intricate challenges. Existing approaches either depend on +model architecture or are based on assumptions regarding density distributions +that are not generally applicable. By leveraging Fisher Information, we +efficiently quantify observed information within Radiance Fields without ground +truth data. This can be used for the next best view selection and pixel-wise +uncertainty quantification. Our method overcomes existing limitations on model +architecture and effectiveness, achieving state-of-the-art results in both view +selection and uncertainty quantification, demonstrating its potential to +advance the field of Radiance Fields. Our method with the 3D Gaussian Splatting +backend could perform view selections at 70 fps. + +
+
+ comment: Project page: https://jiangwenpl.github.io/FisherRF/ +
+
+
+
+
+ + ☆ Gaussian Shell Maps for Efficient 3D Human Generation + + +
+ Efficient generation of 3D digital humans is important in several industries, +including virtual reality, social media, and cinematic production. 3D +generative adversarial networks (GANs) have demonstrated state-of-the-art +(SOTA) quality and diversity for generated assets. Current 3D GAN +architectures, however, typically rely on volume representations, which are +slow to render, thereby hampering the GAN training and requiring +multi-view-inconsistent 2D upsamplers. Here, we introduce Gaussian Shell Maps +(GSMs) as a framework that connects SOTA generator network architectures with +emerging 3D Gaussian rendering primitives using an articulable multi +shell--based scaffold. In this setting, a CNN generates a 3D texture stack with +features that are mapped to the shells. The latter represent inflated and +deflated versions of a template surface of a digital human in a canonical body +pose. Instead of rasterizing the shells directly, we sample 3D Gaussians on the +shells whose attributes are encoded in the texture features. These Gaussians +are efficiently and differentiably rendered. The ability to articulate the +shells is important during GAN training and, at inference time, to deform a +body into arbitrary user-defined poses. Our efficient rendering scheme bypasses +the need for view-inconsistent upsamplers and achieves high-quality multi-view +consistent renderings at a native resolution of $512 \times 512$ pixels. We +demonstrate that GSMs successfully generate 3D humans when trained on +single-view datasets, including SHHQ and DeepFashion. + +
+
+ comment: Project page : https://rameenabdal.github.io/GaussianShellMaps/ +
+
+
+
+
+ + ☆ Evaluating VLMs for Score-Based, Multi-Probe Annotation of 3D Objects + + +
+ Unlabeled 3D objects present an opportunity to leverage pretrained vision +language models (VLMs) on a range of annotation tasks -- from describing object +semantics to physical properties. An accurate response must take into account +the full appearance of the object in 3D, various ways of phrasing the +question/prompt, and changes in other factors that affect the response. We +present a method to marginalize over any factors varied across VLM queries, +utilizing the VLM's scores for sampled responses. We first show that this +probabilistic aggregation can outperform a language model (e.g., GPT4) for +summarization, for instance avoiding hallucinations when there are contrasting +details between responses. Secondly, we show that aggregated annotations are +useful for prompt-chaining; they help improve downstream VLM predictions (e.g., +of object material when the object's type is specified as an auxiliary input in +the prompt). Such auxiliary inputs allow ablating and measuring the +contribution of visual reasoning over language-only reasoning. Using these +evaluations, we show how VLMs can approach, without additional training or +in-context learning, the quality of human-verified type and material +annotations on the large-scale Objaverse dataset. + +
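+ A minimal sketch of score-based marginalization over varied query factors; the
+normalization scheme and the (answer, log_score) input format are assumptions of
+this sketch:
+
+     import math
+     from collections import defaultdict
+
+     def aggregate_vlm_answers(samples):
+         # samples: list of (answer_text, log_score) pairs collected across views,
+         # prompt phrasings, and other varied factors, as scored by the VLM.
+         mass = defaultdict(float)
+         for answer, log_score in samples:
+             mass[answer.strip().lower()] += math.exp(log_score)
+         total = sum(mass.values()) or 1.0
+         ranked = sorted(mass.items(), key=lambda kv: kv[1], reverse=True)
+         return [(ans, m / total) for ans, m in ranked]   # normalized answer weights
+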
+
+
+
+
+ + ☆ Towards Real-World Focus Stacking with Deep Learning + + +
+ Focus stacking is widely used in micro, macro, and landscape photography to +reconstruct all-in-focus images from multiple frames obtained with focus +bracketing, that is, with shallow depth of field and different focus planes. +Existing deep learning approaches to the underlying multi-focus image fusion +problem have limited applicability to real-world imagery since they are +designed for very short image sequences (two to four images), and are typically +trained on small, low-resolution datasets either acquired by light-field +cameras or generated synthetically. We introduce a new dataset consisting of 94 +high-resolution bursts of raw images with focus bracketing, with pseudo ground +truth computed from the data using state-of-the-art commercial software. This +dataset is used to train the first deep learning algorithm for focus stacking +capable of handling bursts of sufficient length for real-world applications. +Qualitative experiments demonstrate that it is on par with existing commercial +solutions in the long-burst, realistic regime while being significantly more +tolerant to noise. The code and dataset are available at +https://github.com/araujoalexandre/FocusStackingDataset. + +
+
+
+
+
+ + ☆ Look Before You Leap: Unveiling the Power of GPT-4V in Robotic + Vision-Language Planning + + +
+ In this study, we are interested in imbuing robots with the capability of +physically-grounded task planning. Recent advancements have shown that large +language models (LLMs) possess extensive knowledge useful in robotic tasks, +especially in reasoning and planning. However, LLMs are constrained by their +lack of world grounding and dependence on external affordance models to +perceive environmental information, which cannot jointly reason with LLMs. We +argue that a task planner should be an inherently grounded, unified multimodal +system. To this end, we introduce Robotic Vision-Language Planning (ViLa), a +novel approach for long-horizon robotic planning that leverages vision-language +models (VLMs) to generate a sequence of actionable steps. ViLa directly +integrates perceptual data into its reasoning and planning process, enabling a +profound understanding of commonsense knowledge in the visual world, including +spatial layouts and object attributes. It also supports flexible multimodal +goal specification and naturally incorporates visual feedback. Our extensive +evaluation, conducted in both real-robot and simulated environments, +demonstrates ViLa's superiority over existing LLM-based planners, highlighting +its effectiveness in a wide array of open-world manipulation tasks. + +
+
+
+
+
+ + ☆ SPiC-E : Structural Priors in 3D Diffusion Models using Cross Entity + Attention + + +
+ We are witnessing rapid progress in automatically generating and manipulating +3D assets due to the availability of pretrained text-image diffusion models. +However, time-consuming optimization procedures are required for synthesizing +each sample, hindering their potential for democratizing 3D content creation. +Conversely, 3D diffusion models now train on million-scale 3D datasets, +yielding high-quality text-conditional 3D samples within seconds. In this work, +we present SPiC-E - a neural network that adds structural guidance to 3D +diffusion models, extending their usage beyond text-conditional generation. At +its core, our framework introduces a cross-entity attention mechanism that +allows for multiple entities (in particular, paired input and guidance 3D +shapes) to interact via their internal representations within the denoising +network. We utilize this mechanism for learning task-specific structural priors +in 3D diffusion models from auxiliary guidance shapes. We show that our +approach supports a variety of applications, including 3D stylization, semantic +shape editing and text-conditional abstraction-to-3D, which transforms +primitive-based abstractions into highly-expressive shapes. Extensive +experiments demonstrate that SPiC-E achieves SOTA performance over these tasks +while often being considerably faster than alternative methods. Importantly, +this is accomplished without tailoring our approach for any specific task. + +
+
+ comment: Project webpage: https://tau-vailab.github.io/spic-e +
+
+
+
+
+ + ☆ Analyzing and Explaining Image Classifiers via Diffusion Guidance + + +
+ While deep learning has led to huge progress in complex image classification +tasks like ImageNet, unexpected failure modes, e.g. via spurious features, call +into question how reliably these classifiers work in the wild. Furthermore, for +safety-critical tasks the black-box nature of their decisions is problematic, +and explanations or at least methods which make decisions plausible are needed +urgently. In this paper, we address these problems by generating images that +optimize a classifier-derived objective using a framework for guided image +generation. We analyze the behavior and decisions of image classifiers by +visual counterfactual explanations (VCEs), detection of systematic mistakes by +analyzing images where classifiers maximally disagree, and visualization of +neurons to verify potential spurious features. In this way, we validate +existing observations, e.g. the shape bias of adversarially robust models, as +well as novel failure modes, e.g. systematic errors of zero-shot CLIP +classifiers, or identify harmful spurious features. Moreover, our VCEs +outperform previous work while being more versatile. + +
+
+
+
+
+ + ☆ DAP: Domain-aware Prompt Learning for Vision-and-Language Navigation + + +
+ Following language instructions to navigate in unseen environments is a +challenging task for autonomous embodied agents. With strong representation +capabilities, pretrained vision-and-language models are widely used in VLN. +However, most of them are trained on web-crawled general-purpose datasets, +which incurs a considerable domain gap when used for VLN tasks. To address the +problem, we propose a novel and model-agnostic domain-aware prompt learning +(DAP) framework. For equipping the pretrained models with specific object-level +and scene-level cross-modal alignment in VLN tasks, DAP applies a low-cost +prompt tuning paradigm to learn soft visual prompts for extracting in-domain +image semantics. Specifically, we first generate a set of in-domain image-text +pairs with the help of the CLIP model. Then we introduce soft visual prompts in +the input space of the visual encoder in a pretrained model. DAP injects +in-domain visual knowledge into the visual encoder of the pretrained model in +an efficient way. Experimental results on both R2R and REVERIE show the +superiority of DAP compared to existing state-of-the-art methods. + +
+
+ comment: 4 pages. arXiv admin note: substantial text overlap with + arXiv:2309.03661 +
+
+
+
+
+ + ☆ Coloring the Past: Neural Historical Buildings Reconstruction from + Archival Photography + + +
+ Historical buildings are a treasure and milestone of human cultural heritage.
+Reconstructing 3D models of these buildings holds significant value. The rapid
+development of neural rendering methods makes it possible to recover the 3D
+shape based only on archival photographs. However, this task presents
+considerable challenges due to the limitations of such datasets. Historical
+photographs are often limited in number, and the scenes in these photos might
+have changed over time. The radiometric quality of these images is also often
+sub-optimal. To address these challenges, we introduce an approach to
+reconstruct the geometry of historical buildings, employing volumetric
+rendering techniques. We leverage dense point clouds as a geometric prior and
+introduce a color appearance embedding loss to recover the color of the
+building given the limited available color images. We aim for our work to spark
+increased interest in and focus on preserving historical buildings. Thus, we
+also introduce a new historical dataset of the Hungarian National Theater,
+providing a new benchmark for the reconstruction method.
+
+
+
+
+
+ + ☆ Aggregation Model Hyperparameters Matter in Digital Pathology + + +
+ Digital pathology has significantly advanced disease detection and +pathologist efficiency through the analysis of gigapixel whole-slide images +(WSI). In this process, WSIs are first divided into patches, for which a +feature extractor model is applied to obtain feature vectors, which are +subsequently processed by an aggregation model to predict the respective WSI +label. With the rapid evolution of representation learning, numerous new +feature extractor models, often termed foundational models, have emerged. +Traditional evaluation methods, however, rely on fixed aggregation model +hyperparameters, a framework we identify as potentially biasing the results. +Our study uncovers a co-dependence between feature extractor models and +aggregation model hyperparameters, indicating that performance comparability +can be skewed based on the chosen hyperparameters. By accounting for this +co-dependency, we find that the performance of many current feature extractor +models is notably similar. We support this insight by evaluating seven feature +extractor models across three different datasets with 162 different aggregation +model configurations. This comprehensive approach provides a more nuanced +understanding of the relationship between feature extractors and aggregation +models, leading to a fairer and more accurate assessment of feature extractor +models in digital pathology. + +
+
+
+
+
+ + ☆ U-Net v2: Rethinking the Skip Connections of U-Net for Medical Image + Segmentation + + +
+ In this paper, we introduce U-Net v2, a new robust and efficient U-Net
+variant for medical image segmentation. It aims to augment the infusion of
+semantic information into low-level features while simultaneously refining
+high-level features with finer details. For an input image, we begin by
+extracting multi-level features with a deep neural network encoder. Next, we
+enhance the feature map of each level by infusing semantic information from
+higher-level features and integrating finer details from lower-level features
+through the Hadamard product. Our novel skip connections empower features of
+all levels with enriched semantic characteristics and intricate details. The
+improved features are subsequently transmitted to the decoder for further
+processing and segmentation. Our method can be seamlessly integrated into any
+encoder-decoder network. We evaluate our method on several public medical image
+segmentation datasets for skin lesion segmentation and polyp segmentation, and
+the experimental results demonstrate that our new method improves segmentation
+accuracy over state-of-the-art methods while preserving memory and
+computational efficiency. Code is available at:
+https://github.com/yaoppeng/U-Net_v2
+
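+ A sketch of a Hadamard-product skip fusion in the spirit described above; the
+1x1-convolution channel handling and pooling choices are assumptions of this
+sketch, not the released code:
+
+     import torch.nn as nn
+     import torch.nn.functional as F
+
+     class SemanticDetailFusion(nn.Module):
+         # Enrich the feature map at one level with semantic features pulled down
+         # from a higher (coarser) level and detail features pulled up from a
+         # lower (finer) level, combined via element-wise (Hadamard) products.
+         def __init__(self, ch_low, ch_mid, ch_high):
+             super().__init__()
+             self.from_high = nn.Conv2d(ch_high, ch_mid, kernel_size=1)
+             self.from_low = nn.Conv2d(ch_low, ch_mid, kernel_size=1)
+
+         def forward(self, f_low, f_mid, f_high):
+             h, w = f_mid.shape[-2:]
+             semantic = F.interpolate(self.from_high(f_high), size=(h, w),
+                                      mode="bilinear", align_corners=False)
+             detail = F.adaptive_max_pool2d(self.from_low(f_low), output_size=(h, w))
+             return f_mid * semantic * detail
+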
+
+
+
+
+ + ☆ One-Shot Open Affordance Learning with Foundation Models + + +
+ We introduce One-shot Open Affordance Learning (OOAL), where a model is +trained with just one example per base object category, but is expected to +identify novel objects and affordances. While vision-language models excel at +recognizing novel objects and scenes, they often struggle to understand finer +levels of granularity such as affordances. To handle this issue, we conduct a +comprehensive analysis of existing foundation models, to explore their inherent +understanding of affordances and assess the potential for data-limited +affordance learning. We then propose a vision-language framework with simple +and effective designs that boost the alignment between visual features and +affordance text embeddings. Experiments on two affordance segmentation +benchmarks show that the proposed method outperforms state-of-the-art models +with less than 1% of the full training data, and exhibits reasonable +generalization capability on unseen objects and affordances. + +
+
+
+
+
+ + ☆ PillarNeSt: Embracing Backbone Scaling and Pretraining for Pillar-based + 3D Object Detection + + +
+ This paper shows the effectiveness of 2D backbone scaling and pretraining for
+pillar-based 3D object detectors. Pillar-based methods mainly employ a randomly
+initialized 2D convolutional neural network (ConvNet) for feature extraction
+and fail to enjoy the benefits of backbone scaling and pretraining in the image
+domain. To show the scaling-up capacity on point clouds, we introduce a dense
+ConvNet pretrained on large-scale image datasets (e.g., ImageNet) as the 2D
+backbone of pillar-based detectors. The ConvNets are adaptively designed based
+on the model size according to the specific characteristics of point clouds,
+such as sparsity and irregularity. Equipped with the pretrained ConvNets, our
+proposed pillar-based detector, termed PillarNeSt, outperforms existing 3D
+object detectors by a large margin on the nuScenes and Argoverse 2 datasets.
+Our code shall be released upon acceptance.
+
+
+
+
+
+ + ☆ Cinematic Behavior Transfer via NeRF-based Differentiable Filming + + +
+ In the evolving landscape of digital media and video production, the precise +manipulation and reproduction of visual elements like camera movements and +character actions are highly desired. Existing SLAM methods face limitations in +dynamic scenes and human pose estimation often focuses on 2D projections, +neglecting 3D statuses. To address these issues, we first introduce a reverse +filming behavior estimation technique. It optimizes camera trajectories by +leveraging NeRF as a differentiable renderer and refining SMPL tracks. We then +introduce a cinematic transfer pipeline that is able to transfer various shot +types to a new 2D video or a 3D virtual environment. The incorporation of 3D +engine workflow enables superior rendering and control abilities, which also +achieves a higher rating in the user study. + +
+
+ comment: Project Page: + https://virtualfilmstudio.github.io/projects/cinetransfer +
+
+
+
+
+ + ☆ BAND-2k: Banding Artifact Noticeable Database for Banding Detection and + Quality Assessment + + +
+ Banding, also known as staircase-like contours, frequently occurs in flat +areas of images/videos processed by the compression or quantization algorithms. +As undesirable artifacts, banding destroys the original image structure, thus +degrading users' quality of experience (QoE). In this paper, we systematically +investigate the banding image quality assessment (IQA) problem, aiming to +detect the image banding artifacts and evaluate their perceptual visual +quality. Considering that the existing image banding databases only contain +limited content sources and banding generation methods, and lack perceptual +quality labels (i.e. mean opinion scores), we first build the largest banding +IQA database so far, named Banding Artifact Noticeable Database (BAND-2k), +which consists of 2,000 banding images generated by 15 compression and +quantization schemes. A total of 23 workers participated in the subjective IQA +experiment, yielding over 214,000 patch-level banding class labels and 44,371 +reliable image-level quality ratings. Subsequently, we develop an effective +no-reference (NR) banding evaluator for banding detection and quality +assessment by leveraging frequency characteristics of banding artifacts. A dual +convolutional neural network is employed to concurrently learn the feature +representation from the high-frequency and low-frequency maps, thereby +enhancing the ability to discern banding artifacts. The quality score of a +banding image is generated by pooling the banding detection maps masked by the +spatial frequency filters. Experiments demonstrate that our banding evaluator +achieves a remarkably high accuracy in banding detection and also exhibits high +SRCC and PLCC results with the perceptual quality labels. These findings unveil +the strong correlations between the intensity of banding artifacts and the +perceptual visual quality, thus validating the necessity of banding quality +assessment. + +
+
+
+
+
+ + ☆ Variational Bayes image restoration with compressive autoencoders + + +
+ Regularization of inverse problems is of paramount importance in
+computational imaging. The ability of neural networks to learn efficient image
+representations has been recently exploited to design powerful data-driven
+regularizers. While state-of-the-art plug-and-play methods rely on an implicit
+regularization provided by neural denoisers, alternative Bayesian approaches
+consider Maximum A Posteriori (MAP) estimation in the latent space of a
+generative model, thus with an explicit regularization. However,
+state-of-the-art deep generative models require a huge amount of training data
+compared to denoisers. Besides, their complexity hampers the optimization of
+the latent MAP. In this work, we propose to use compressive autoencoders for
+latent estimation. These networks, which can be seen as variational
+autoencoders with a flexible latent prior, are smaller and easier to train than
+state-of-the-art generative models. We then introduce the Variational Bayes
+Latent Estimation (VBLE) algorithm, which performs this estimation within the
+framework of variational inference. This allows for fast and easy (approximate)
+posterior sampling. Experimental results on the BSD and FFHQ image datasets
+demonstrate that VBLE reaches performance similar to that of state-of-the-art
+plug-and-play methods, while being able to quantify uncertainties faster than
+other existing posterior sampling techniques.
+
+
+
+
+
+ + ☆ GenZI: Zero-Shot 3D Human-Scene Interaction Generation + + +
+ Can we synthesize 3D humans interacting with scenes without learning from any +3D human-scene interaction data? We propose GenZI, the first zero-shot approach +to generating 3D human-scene interactions. Key to GenZI is our distillation of +interaction priors from large vision-language models (VLMs), which have learned +a rich semantic space of 2D human-scene compositions. Given a natural language +description and a coarse point location of the desired interaction in a 3D +scene, we first leverage VLMs to imagine plausible 2D human interactions +inpainted into multiple rendered views of the scene. We then formulate a robust +iterative optimization to synthesize the pose and shape of a 3D human model in +the scene, guided by consistency with the 2D interaction hypotheses. In +contrast to existing learning-based approaches, GenZI circumvents the +conventional need for captured 3D interaction data, and allows for flexible +control of the 3D interaction synthesis with easy-to-use text prompts. +Extensive experiments show that our zero-shot approach has high flexibility and +generality, making it applicable to diverse scene types, including both indoor +and outdoor environments. + +
+
+ comment: Project page: https://craigleili.github.io/projects/genzi/ Video: + https://youtu.be/ozfs6E0JIMY +
+
+
+
+
+ + ☆ Receler: Reliable Concept Erasing of Text-to-Image Diffusion Models via + Lightweight Erasers + + +
+ Concept erasure in text-to-image diffusion models aims to prevent pre-trained
+diffusion models from generating images related to a target concept. To perform
+reliable concept erasure, the properties of robustness and locality are
+desirable. The former prevents the model from producing images associated with
+the target concept for any paraphrased or learned prompts, while the latter
+preserves the model's ability to generate images of non-target concepts. In
+this paper, we propose Reliable Concept Erasing via Lightweight Erasers
+(Receler), which learns a lightweight Eraser to perform concept erasing and
+enhances locality and robustness with the proposed concept-localized
+regularization and adversarial prompt learning, respectively. Comprehensive
+quantitative and qualitative experiments with various concept prompts verify
+the superiority of Receler over previous erasing methods with respect to these
+two desirable properties.
+
+
+
+
+ + ☆ SAMPro3D: Locating SAM Prompts in 3D for Zero-Shot Scene Segmentation + + +
+ We introduce SAMPro3D for zero-shot 3D indoor scene segmentation. Given the +3D point cloud and multiple posed 2D frames of 3D scenes, our approach segments +3D scenes by applying the pretrained Segment Anything Model (SAM) to 2D frames. +Our key idea involves locating 3D points in scenes as natural 3D prompts to +align their projected pixel prompts across frames, ensuring frame-consistency +in both pixel prompts and their SAM-predicted masks. Moreover, we suggest +filtering out low-quality 3D prompts based on feedback from all 2D frames, for +enhancing segmentation quality. We also propose to consolidate different 3D +prompts if they are segmenting the same object, bringing a more comprehensive +segmentation. Notably, our method does not require any additional training on +domain-specific data, enabling us to preserve the zero-shot power of SAM. +Extensive qualitative and quantitative results show that our method +consistently achieves higher quality and more diverse segmentation than +previous zero-shot or fully supervised approaches, and in many cases even +surpasses human-level annotations. The project page can be accessed at +https://mutianxu.github.io/sampro3d/. + +
+
+ comment: Project page: https://mutianxu.github.io/sampro3d/ +
+
+
+
+
+ + ☆ Fair Text-to-Image Diffusion via Fair Mapping + + +
+ In this paper, we address the limitations of existing text-to-image diffusion +models in generating demographically fair results when given human-related +descriptions. These models often struggle to disentangle the target language +context from sociocultural biases, resulting in biased image generation. To +overcome this challenge, we propose Fair Mapping, a general, model-agnostic, +and lightweight approach that modifies a pre-trained text-to-image model by +controlling the prompt to achieve fair image generation. One key advantage of +our approach is its high efficiency. The training process only requires +updating a small number of parameters in an additional linear mapping network. +This not only reduces the computational cost but also accelerates the +optimization process. We first demonstrate the issue of bias in generated +results caused by language biases in text-guided diffusion models. By +developing a mapping network that projects language embeddings into an unbiased +space, we enable the generation of relatively balanced demographic results +based on a keyword specified in the prompt. With comprehensive experiments on +face image generation, we show that our method significantly improves image +generation performance when prompted with descriptions related to human faces. +By effectively addressing the issue of bias, we produce more fair and diverse +image outputs. This work contributes to the field of text-to-image generation +by enhancing the ability to generate images that accurately reflect the +intended demographic characteristics specified in the text. + +
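Since the abstract emphasizes that only a small linear mapping network over the text embeddings is trained, here is a minimal sketch of that idea in PyTorch. The embedding dimension, identity initialization, and the way it would plug into a frozen text-to-image pipeline are assumptions for illustration, not the paper's actual code.

```python
# Hedged sketch of a lightweight "fair mapping" over prompt embeddings.
import torch
import torch.nn as nn

class FairMapping(nn.Module):
    def __init__(self, embed_dim: int):
        super().__init__()
        # the only trainable parameters: a single linear map over text embeddings
        self.proj = nn.Linear(embed_dim, embed_dim)
        nn.init.eye_(self.proj.weight)
        nn.init.zeros_(self.proj.bias)

    def forward(self, text_embeds: torch.Tensor) -> torch.Tensor:
        # debiased embeddings are then fed to the frozen text-to-image model
        return self.proj(text_embeds)

# training outline (assumed): freeze the diffusion model, update only `mapping`
# mapping = FairMapping(768)
# optimizer = torch.optim.Adam(mapping.parameters(), lr=1e-4)
```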
+
+
+
+
+ + ☆ Toward a Surgeon-in-the-Loop Ophthalmic Robotic Apprentice using + Reinforcement and Imitation Learning + + +
+ Robotic-assisted surgical systems have demonstrated significant potential in +enhancing surgical precision and minimizing human errors. However, existing +systems lack the ability to accommodate the unique preferences and requirements +of individual surgeons. Additionally, they primarily focus on general surgeries +(e.g., laparoscopy) and are not suitable for highly precise microsurgeries, +such as ophthalmic procedures. Thus, we propose a simulation-based image-guided +approach for surgeon-centered autonomous agents that can adapt to the +individual surgeon's skill level and preferred surgical techniques during +ophthalmic cataract surgery. Our approach utilizes a simulated environment to +train reinforcement and imitation learning agents guided by image data to +perform all tasks of the incision phase of cataract surgery. By integrating the +surgeon's actions and preferences into the training process with the +surgeon-in-the-loop, our approach enables the robot to implicitly learn and +adapt to the individual surgeon's unique approach through demonstrations. This +results in a more intuitive and personalized surgical experience for the +surgeon. Simultaneously, it ensures consistent performance for the autonomous +robotic apprentice. We define and evaluate the effectiveness of our approach +using our proposed metrics; and highlight the trade-off between a generic agent +and a surgeon-centered adapted agent. Moreover, our approach has the potential +to extend to other ophthalmic surgical procedures, opening the door to a new +generation of surgeon-in-the-loop autonomous surgical robots. We provide an +open-source simulation framework for future development and reproducibility. + +
+
+
+
+
+ + ☆ COVIDx CXR-4: An Expanded Multi-Institutional Open-Source Benchmark + Dataset for Chest X-ray Image-Based Computer-Aided COVID-19 Diagnostics + + +
+ The global ramifications of the COVID-19 pandemic remain significant, +exerting persistent pressure on nations even three years after its initial +outbreak. Deep learning models have shown promise in improving COVID-19 +diagnostics but require diverse and larger-scale datasets to improve +performance. In this paper, we introduce COVIDx CXR-4, an expanded +multi-institutional open-source benchmark dataset for chest X-ray image-based +computer-aided COVID-19 diagnostics. COVIDx CXR-4 expands significantly on the +previous COVIDx CXR-3 dataset by increasing the total patient cohort size by +greater than 2.66 times, resulting in 84,818 images from 45,342 patients across +multiple institutions. We provide extensive analysis on the diversity of the +patient demographic, imaging metadata, and disease distributions to highlight +potential dataset biases. To the best of the authors' knowledge, COVIDx CXR-4 +is the largest and most diverse open-source COVID-19 CXR dataset and is made +publicly available as part of an open initiative to advance research to aid +clinicians against the COVID-19 disease. + +
+
+
+
+
+ + ☆ Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in + Autonomous Driving Applications + + +
+ Understanding how the surrounding environment changes is crucial for +performing downstream tasks safely and reliably in autonomous driving +applications. Recent occupancy estimation techniques using only camera images +as input can provide dense occupancy representations of large-scale scenes +based on the current observation. However, they are mostly limited to +representing the current 3D space and do not consider the future state of +surrounding objects along the time axis. To extend camera-only occupancy +estimation into spatiotemporal prediction, we propose Cam4DOcc, a new benchmark +for camera-only 4D occupancy forecasting, evaluating the surrounding scene +changes in a near future. We build our benchmark based on multiple publicly +available datasets, including nuScenes, nuScenes-Occupancy, and Lyft-Level5, +which provides sequential occupancy states of general movable and static +objects, as well as their 3D backward centripetal flow. To establish this +benchmark for future research with comprehensive comparisons, we introduce four +baseline types from diverse camera-based perception and prediction +implementations, including a static-world occupancy model, voxelization of +point cloud prediction, 2D-3D instance-based prediction, and our proposed novel +end-to-end 4D occupancy forecasting network. Furthermore, the standardized +evaluation protocol for preset multiple tasks is also provided to compare the +performance of all the proposed baselines on present and future occupancy +estimation with respect to objects of interest in autonomous driving scenarios. +The dataset and our implementation of all four baselines in the proposed +Cam4DOcc benchmark will be released here: https://github.com/haomo-ai/Cam4DOcc. + +
+
+
+
+
+ + ☆ Volumetric Cloud Field Reconstruction + + +
+ Volumetric phenomena, such as clouds and fog, present a significant challenge +for 3D reconstruction systems due to their translucent nature and their complex +interactions with light. Conventional techniques for reconstructing scattering +volumes rely on controlled setups, limiting practical applications. This paper +introduces an approach to reconstructing volumes from a few input stereo pairs. +We propose a novel deep learning framework that integrates a deep stereo model +with a 3D Convolutional Neural Network (3D CNN) and an advection module, +capable of capturing the shape and dynamics of volumes. The stereo depths are +used to carve empty space around volumes, providing the 3D CNN with a prior for +coping with the lack of input views. Refining our output, the advection module +leverages the temporal evolution of the medium, providing a mechanism to infer +motion and improve temporal consistency. The efficacy of our system is +demonstrated through its ability to estimate density and velocity fields of +large-scale volumes, in this case, clouds, from a sparse set of stereo image +pairs. + +
+
+ comment: Project page at https://cloud-field.github.io +
+
+
+
+
+ + ☆ Multiple Toddler Tracking in Indoor Videos + + +
+ Multiple toddler tracking (MTT) involves identifying and differentiating
+toddlers in video footage. While conventional multi-object tracking (MOT)
+algorithms are adept at tracking diverse objects, toddlers pose unique
+challenges due to their unpredictable movements, various poses, and similar
+appearance. Tracking toddlers in indoor environments introduces additional
+complexities such as occlusions and limited fields of view. In this paper, we
+address the challenges of MTT and propose MTTSort, a customized method built
+upon the DeepSort algorithm. MTTSort is designed to track multiple toddlers in
+indoor videos accurately. Our contributions include discussing the primary
+challenges in MTT, introducing a genetic algorithm to optimize hyperparameters,
+proposing an accurate tracking algorithm, and curating the MTTrack dataset
+using unbiased AI co-labeling techniques. We quantitatively compare MTTSort to
+state-of-the-art MOT methods on the MTTrack, DanceTrack, and MOT15 datasets. In
+our evaluation, the proposed method outperformed other MOT methods, achieving
+0.98, 0.68, and 0.98 in multiple object tracking accuracy (MOTA), higher order
+tracking accuracy (HOTA), and identification F1 (IDF1) metrics, respectively.
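As a rough illustration of the hyperparameter search mentioned above, the sketch below runs a simple genetic algorithm over a few DeepSort-style tracker parameters, scoring each candidate with a user-supplied MOTA evaluation function. The search space, mutation scheme, and `evaluate_mota` callback are placeholder assumptions rather than the configuration used by MTTSort.

```python
# Hedged sketch of genetic-algorithm tuning of tracker hyperparameters.
import random

SEARCH_SPACE = {"max_iou_distance": (0.5, 0.9), "max_age": (10, 60), "n_init": (1, 5)}

def random_individual():
    return {k: random.uniform(lo, hi) for k, (lo, hi) in SEARCH_SPACE.items()}

def mutate(ind, rate=0.3):
    child = dict(ind)
    for k, (lo, hi) in SEARCH_SPACE.items():
        if random.random() < rate:
            child[k] = random.uniform(lo, hi)   # round integer-valued params before use
    return child

def genetic_search(evaluate_mota, pop_size=12, generations=10):
    population = [random_individual() for _ in range(pop_size)]
    for _ in range(generations):
        scored = sorted(population, key=evaluate_mota, reverse=True)
        elite = scored[: pop_size // 3]                  # keep the best third
        population = elite + [mutate(random.choice(elite))
                              for _ in range(pop_size - len(elite))]
    return max(population, key=evaluate_mota)
```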
+
+
+
+
+ + ☆ Vulnerability of Automatic Identity Recognition to Audio-Visual + Deepfakes + + +
+ The task of deepfake detection is far from solved by speech or vision
+researchers. Several publicly available databases of fake synthetic video and
+speech have been built to aid the development of detection methods. However,
+existing databases typically focus on visual or voice modalities and provide no
+proof that their deepfakes can in fact impersonate any real person. In this
+paper, we present SWAN-DF, the first realistic audio-visual database of
+deepfakes, where lips and speech are well synchronized and the videos have high
+visual and audio quality. We took the publicly available SWAN dataset of real
+videos with different identities to create audio-visual deepfakes, using
+several models from DeepFaceLab and blending techniques for face swapping, and
+the HiFiVC, DiffVC, YourTTS, and FreeVC models for voice conversion. From the
+publicly available speech dataset LibriTTS, we also created LibriTTS-DF, a
+separate database of audio-only deepfakes, using several recent text-to-speech
+methods: YourTTS, Adaspeech, and TorToiSe. We demonstrate the vulnerability of
+a state-of-the-art speaker recognition system, the ECAPA-TDNN-based model from
+SpeechBrain, to the synthetic voices. Similarly, we tested a face recognition
+system based on the MobileFaceNet architecture against several variants of our
+visual deepfakes. The vulnerability assessment shows that by tuning the
+existing pretrained deepfake models to specific identities, one can
+successfully spoof the face and speaker recognition systems more than 90% of
+the time and achieve very realistic-looking and -sounding fake videos of a
+given person.
+
+ comment: 10 pages, 3 figures, 3 tables +
+
+
+
+
+ + ☆ VIM: Probing Multimodal Large Language Models for Visual Embedded + Instruction Following + + +
+ We introduce VISUAL EMBEDDED INSTRUCTION (VIM), a new framework designed to
+evaluate the visual instruction-following capability of Multimodal Large
+Language Models (MLLMs). As illustrated in Figure 2, VIM challenges the MLLMs
+by embedding the instructions into the visual scenes, demanding strong visual
+interpretative skills for instruction following. We adapt VIM to various
+benchmarks, including VQAv2, MME, MM-Vet, and the RefCOCO series, compose a VIM
+bench, and probe diverse MLLMs across three distinct in-context learning
+settings: Zero Shot, One Shot, and Pair Shot. We observe a significant
+performance disparity between the open-source MLLMs and GPT-4V, implying that
+their proficiency in visual instruction comprehension is not up to par. Our
+results highlight a promising direction for enhancing the instruction-following
+capabilities of MLLMs. We intend VIM to serve as a useful benchmark for
+advancing the state of the art and driving further progress in the field.
+
+ comment: 20 pages, 8 figures, 20 tables +
+
+
+
+
+ + ☆ Neural Fields with Thermal Activations for Arbitrary-Scale + Super-Resolution + + +
+ Recent approaches for arbitrary-scale single image super-resolution (ASSR) +have used local neural fields to represent continuous signals that can be +sampled at different rates. However, in such formulation, the point-wise query +of field values does not naturally match the point spread function (PSF) of a +given pixel. In this work we present a novel way to design neural fields such +that points can be queried with a Gaussian PSF, which serves as anti-aliasing +when moving across resolutions for ASSR. We achieve this using a novel +activation function derived from Fourier theory and the heat equation. This +comes at no additional cost: querying a point with a Gaussian PSF in our +framework does not affect computational cost, unlike filtering in the image +domain. Coupled with a hypernetwork, our method not only provides theoretically +guaranteed anti-aliasing, but also sets a new bar for ASSR while also being +more parameter-efficient than previous methods. + +
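The anti-aliasing claim rests on a classical fact: convolving a sinusoid of frequency omega with a Gaussian of standard deviation sigma simply scales it by exp(-0.5 * sigma^2 * omega^2). Below is a hedged sketch of how such a "PSF-aware" Fourier feature could be evaluated at no extra cost; the function name, shapes, and parameters are illustrative and are not the paper's implementation.

```python
# Hedged sketch: Gaussian-PSF filtering of sinusoidal features has a closed form.
import torch

def gaussian_filtered_fourier_features(x, omegas, phases, sigma):
    # x: (..., 1) query coordinates; omegas, phases: (F,) per-feature frequency/phase
    raw = torch.sin(x * omegas + phases)                  # plain point-wise field query
    attenuation = torch.exp(-0.5 * (sigma ** 2) * omegas ** 2)
    return raw * attenuation                              # query "as seen through" the PSF
```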
+
+
+
+
+ + ☆ Erasing the Ephemeral: Joint Camera Refinement and Transient Object + Removal for Street View Synthesis + + +
+ Synthesizing novel views for urban environments is crucial for tasks like
+autonomous driving and virtual tours. Compared to object-level or indoor
+situations, outdoor settings present unique challenges, such as inconsistency
+across frames due to moving vehicles and camera pose drift over lengthy
+sequences. In this paper, we introduce a method that tackles these challenges
+of view synthesis for outdoor scenarios. We employ a neural point light field
+scene representation and strategically detect and mask out dynamic objects to
+reconstruct novel scenes without artifacts. Moreover, we optimize the camera
+poses jointly with the view synthesis process, refining both elements
+simultaneously. Through validation on real-world urban datasets, we demonstrate
+state-of-the-art results in synthesizing novel views of urban scenes.
+
+
+
+
+ + ☆ Efficient Decoder for End-to-End Oriented Object Detection in Remote + Sensing Images + + +
+ Object instances in remote sensing images are often arbitrarily oriented,
+vary widely in scale, and are densely distributed. These issues pose challenges
+for end-to-end oriented object detectors, including multi-scale feature
+alignment and a large number of queries. To address these limitations, we
+propose an end-to-end oriented detector equipped with an efficient decoder,
+which incorporates two technologies, Rotated RoI attention (RRoI attention) and
+Selective Distinct Queries (SDQ). Specifically, RRoI attention effectively
+focuses on oriented regions of interest through a cross-attention mechanism and
+aligns multi-scale features. SDQ collects queries from intermediate decoder
+layers and then filters similar queries to obtain distinct queries. The
+proposed SDQ can facilitate the optimization of one-to-one label assignment,
+without introducing redundant initial queries or extra auxiliary branches.
+Extensive experiments on five datasets demonstrate the effectiveness of our
+method. Notably, our method achieves state-of-the-art performance on DIOR-R
+(67.31% mAP), DOTA-v1.5 (67.43% mAP), and DOTA-v2.0 (53.28% mAP) with the
+ResNet50 backbone.
+
+
+
+
+ + ☆ Focus on Query: Adversarial Mining Transformer for Few-Shot Segmentation NeurIPS 2023 + + +
+ Few-shot segmentation (FSS) aims to segment objects of new categories given
+only a handful of annotated samples. Previous works focus their efforts on
+exploring the support information while paying less attention to mining the
+critical query branch. In this paper, we rethink the importance of support
+information and propose a new query-centric FSS model, Adversarial Mining
+Transformer (AMFormer), which achieves accurate query image segmentation with
+only rough support guidance or even weak support labels. The proposed AMFormer
+enjoys several merits. First, we design an object mining transformer (G) that
+expands the incomplete regions activated by the support cues, and a detail
+mining transformer (D) that discriminates the detailed local differences
+between the expanded mask and the ground truth. Second, we propose to train G
+and D via an adversarial process, where G is optimized to generate more
+accurate masks approaching the ground truth to fool D. We conduct extensive
+experiments on the commonly used Pascal-5i and COCO-20i benchmarks and achieve
+state-of-the-art results across all settings. In addition, the decent
+performance with weak support labels in our query-centric paradigm may inspire
+the development of more general FSS models. Code will be available at
+https://github.com/Wyxdm/AMNet.
+
+ comment: Accepted to NeurIPS 2023 +
+
+
+
+
+ + ☆ ShapeGPT: 3D Shape Generation with A Unified Multi-modal Language Model + + +
+ The advent of large language models, enabling flexibility through +instruction-driven approaches, has revolutionized many traditional generative +tasks, but large models for 3D data, particularly in comprehensively handling +3D shapes with other modalities, are still under-explored. By achieving +instruction-based shape generations, versatile multimodal generative shape +models can significantly benefit various fields like 3D virtual construction +and network-aided design. In this work, we present ShapeGPT, a shape-included +multi-modal framework to leverage strong pre-trained language models to address +multiple shape-relevant tasks. Specifically, ShapeGPT employs a +word-sentence-paragraph framework to discretize continuous shapes into shape +words, further assembles these words for shape sentences, as well as integrates +shape with instructional text for multi-modal paragraphs. To learn this +shape-language model, we use a three-stage training scheme, including shape +representation, multimodal alignment, and instruction-based generation, to +align shape-language codebooks and learn the intricate correlations among these +modalities. Extensive experiments demonstrate that ShapeGPT achieves comparable +performance across shape-relevant tasks, including text-to-shape, +shape-to-text, shape completion, and shape editing. + +
+
+
+
+
+ + ☆ AnyLens: A Generative Diffusion Model with Any Rendering Lens + + +
+ State-of-the-art diffusion models can generate highly realistic images based +on various conditioning like text, segmentation, and depth. However, an +essential aspect often overlooked is the specific camera geometry used during +image capture. The influence of different optical systems on the final scene +appearance is frequently overlooked. This study introduces a framework that +intimately integrates a text-to-image diffusion model with the particular lens +geometry used in image rendering. Our method is based on a per-pixel coordinate +conditioning method, enabling the control over the rendering geometry. Notably, +we demonstrate the manipulation of curvature properties, achieving diverse +visual effects, such as fish-eye, panoramic views, and spherical texturing +using a single diffusion model. + +
+
+
+
+
+ + ☆ Adversarial Robust Memory-Based Continual Learner + + +
+ Despite the remarkable advances that have been made in continual learning,
+the adversarial vulnerability of such methods has not been fully discussed. We
+delve into the adversarial robustness of memory-based continual learning
+algorithms and observe limited robustness improvement when directly applying
+adversarial training techniques. Preliminary studies reveal the twin challenges
+for building adversarially robust continual learners: accelerated forgetting in
+continual learning and gradient obfuscation in adversarial robustness. In this
+study, we put forward a novel adversarially robust memory-based continual
+learner that adjusts data logits to mitigate the forgetting of past knowledge
+caused by adversarial samples. Furthermore, we devise a gradient-based data
+selection mechanism to overcome the gradient obfuscation caused by limited
+stored data. The proposed approach can be widely integrated with existing
+memory-based continual learning as well as adversarial training algorithms in a
+plug-and-play way. Extensive experiments on Split-CIFAR10/100 and
+Split-Tiny-ImageNet demonstrate the effectiveness of our approach, achieving up
+to 8.13% higher accuracy for adversarial data.
+
+
+
+
+ + ☆ Topology-Preserving Adversarial Training + + +
+ Despite the effectiveness in improving the robustness of neural networks, +adversarial training has suffered from the natural accuracy degradation +problem, i.e., accuracy on natural samples has reduced significantly. In this +study, we reveal that natural accuracy degradation is highly related to the +disruption of the natural sample topology in the representation space by +quantitative and qualitative experiments. Based on this observation, we propose +Topology-pReserving Adversarial traINing (TRAIN) to alleviate the problem by +preserving the topology structure of natural samples from a standard model +trained only on natural samples during adversarial training. As an additional +regularization, our method can easily be combined with various popular +adversarial training algorithms in a plug-and-play manner, taking advantage of +both sides. Extensive experiments on CIFAR-10, CIFAR-100, and Tiny ImageNet +show that our proposed method achieves consistent and significant improvements +over various strong baselines in most cases. Specifically, without additional +data, our proposed method achieves up to 8.78% improvement in natural accuracy +and 4.50% improvement in robust accuracy. + +
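One plausible way to picture the topology-preservation idea is as a regularizer that matches the neighborhood structure of the robust model's features to that of a frozen, naturally trained reference model, as in the hedged sketch below. The softmax-over-similarities formulation, the temperature, and the loss weighting are assumptions; the paper's actual regularizer may be defined differently.

```python
# Hedged sketch of a topology-preservation regularizer between two feature batches.
import torch
import torch.nn.functional as F

def topology_loss(feat_robust, feat_natural, tau=0.1):
    # feat_*: (B, D) batch features; compare row-wise neighborhood distributions
    p = F.softmax(feat_natural @ feat_natural.t() / tau, dim=1).detach()
    q = F.log_softmax(feat_robust @ feat_robust.t() / tau, dim=1)
    return F.kl_div(q, p, reduction="batchmean")

# total_loss = adversarial_training_loss + lambda_topo * topology_loss(f_adv, f_nat)
```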
+
+
+
+
+ + ☆ Query-Relevant Images Jailbreak Large Multi-Modal Models + + +
+ Warning: This paper contains examples of harmful language and images, and
+reader discretion is recommended. The security concerns surrounding Large
+Language Models (LLMs) have been extensively explored, yet the safety of Large
+Multi-Modal Models (LMMs) remains understudied. In our study, we present a
+novel visual prompt attack that exploits query-relevant images to jailbreak
+open-source LMMs. Our method creates a composite image from one image generated
+by diffusion models and another that displays the text as typography, based on
+keywords extracted from a malicious query. We show that LMMs can be easily
+attacked by our approach, even if the employed Large Language Models are safely
+aligned. To evaluate the extent of this vulnerability in open-source LMMs, we
+have compiled a substantial dataset encompassing 13 scenarios with a total of
+5,040 text-image pairs, using our presented attack technique. Our evaluation of
+12 cutting-edge LMMs using this dataset shows the vulnerability of existing
+multi-modal models to adversarial attacks. This finding underscores the need
+for a concerted effort to strengthen and enhance the safety measures of
+open-source LMMs against potential malicious exploits. The resource is
+available at https://github.com/isXinLiu/MM-SafetyBench.
+
+ comment: Technique report +
+
+
+
+
+ + ☆ Continual Self-supervised Learning: Towards Universal Multi-modal + Medical Data Representation Learning + + +
+ Self-supervised learning is an efficient pre-training method for medical +image analysis. However, current research is mostly confined to +specific-modality data pre-training, consuming considerable time and resources +without achieving universality across different modalities. A straightforward +solution is combining all modality data for joint self-supervised pre-training, +which poses practical challenges. Firstly, our experiments reveal conflicts in +representation learning as the number of modalities increases. Secondly, +multi-modal data collected in advance cannot cover all real-world scenarios. In +this paper, we reconsider versatile self-supervised learning from the +perspective of continual learning and propose MedCoSS, a continuous +self-supervised learning approach for multi-modal medical data. Unlike joint +self-supervised learning, MedCoSS assigns different modality data to different +training stages, forming a multi-stage pre-training process. To balance modal +conflicts and prevent catastrophic forgetting, we propose a rehearsal-based +continual learning method. We introduce the k-means sampling strategy to retain +data from previous modalities and rehearse it when learning new modalities. +Instead of executing the pretext task on buffer data, a feature distillation +strategy and an intra-modal mixup strategy are applied to these data for +knowledge retention. We conduct continuous self-supervised pre-training on a +large-scale multi-modal unlabeled dataset, including clinical reports, X-rays, +CT scans, MRI scans, and pathological images. Experimental results demonstrate +MedCoSS's exceptional generalization ability across nine downstream datasets +and its significant scalability in integrating new modality data. Code and +pre-trained weight are available at https://github.com/yeerwen/MedCoSS. + +
+
+
+
+
+ + ☆ LanGWM: Language Grounded World Model + + +
+ Recent advances in deep reinforcement learning have showcased its potential
+in tackling complex tasks. However, experiments on visual control tasks have
+revealed that state-of-the-art reinforcement learning models struggle with
+out-of-distribution generalization. Conversely, expressing higher-level
+concepts and global contexts is relatively easy using language.
+ Building upon the recent success of large language models, our main objective
+is to improve the state abstraction technique in reinforcement learning by
+leveraging language for robust action selection. Specifically, we focus on
+learning language-grounded visual features to enhance world model learning, a
+model-based reinforcement learning technique.
+ To enforce our hypothesis explicitly, we mask out the bounding boxes of a few
+objects in the image observation and provide the text prompt as descriptions
+for these masked objects. Subsequently, we predict the masked objects along
+with the surrounding regions as pixel reconstruction, similar to the
+transformer-based masked autoencoder approach.
+ Our proposed LanGWM (Language Grounded World Model) achieves state-of-the-art
+performance on the out-of-distribution test of the 100K-interaction-step
+benchmark of iGibson point navigation tasks. Furthermore, our proposed
+technique of explicit language-grounded visual representation learning has the
+potential to improve models for human-robot interaction, because our extracted
+visual features are language grounded.
+
+
+
+
+ + ☆ SyncTalk: The Devil is in the Synchronization for Talking Head Synthesis + + +
+ Achieving high synchronization in the synthesis of realistic, speech-driven +talking head videos presents a significant challenge. Traditional Generative +Adversarial Networks (GAN) struggle to maintain consistent facial identity, +while Neural Radiance Fields (NeRF) methods, although they can address this +issue, often produce mismatched lip movements, inadequate facial expressions, +and unstable head poses. A lifelike talking head requires synchronized +coordination of subject identity, lip movements, facial expressions, and head +poses. The absence of these synchronizations is a fundamental flaw, leading to +unrealistic and artificial outcomes. To address the critical issue of +synchronization, identified as the "devil" in creating realistic talking heads, +we introduce SyncTalk. This NeRF-based method effectively maintains subject +identity, enhancing synchronization and realism in talking head synthesis. +SyncTalk employs a Face-Sync Controller to align lip movements with speech and +innovatively uses a 3D facial blendshape model to capture accurate facial +expressions. Our Head-Sync Stabilizer optimizes head poses, achieving more +natural head movements. The Portrait-Sync Generator restores hair details and +blends the generated head with the torso for a seamless visual experience. +Extensive experiments and user studies demonstrate that SyncTalk outperforms +state-of-the-art methods in synchronization and realism. We recommend watching +the supplementary video: https://ziqiaopeng.github.io/synctalk + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ CLIPC8: Face liveness detection algorithm based on image-text pairs and + contrastive learning + + +
+ Face recognition technology is widely used in the financial field, and
+various types of liveness attack behaviors need to be addressed. Existing
+liveness detection algorithms are trained on specific training datasets and
+tested on testing datasets, but their performance and robustness in
+transferring to unseen datasets are relatively poor. To tackle this issue, we
+propose a face liveness detection method based on image-text pairs and
+contrastive learning, dividing liveness attack problems in the financial field
+into eight categories and using text information to describe the images of
+these eight types of attacks. The text encoder and image encoder are used to
+extract feature vector representations for the classification description text
+and face images, respectively. By maximizing the similarity of positive samples
+and minimizing the similarity of negative samples, the model learns shared
+representations between images and texts. The proposed method is capable of
+effectively detecting specific liveness attack behaviors in certain scenarios,
+such as those occurring in dark environments or involving the tampering of ID
+card photos. Additionally, it is also effective in detecting traditional
+liveness attack methods, such as printed photo attacks and screen remake
+attacks. The zero-shot face liveness detection capability on five public
+datasets, including NUAA, CASIA-FASD, Replay-Attack, OULU-NPU, and MSU-MFSD,
+also reaches the level of commercial algorithms. The detection capability of
+the proposed algorithm was verified on five types of testing datasets; the
+results show that the method outperformed commercial algorithms, with detection
+rates reaching 100% on multiple datasets, demonstrating the effectiveness and
+robustness of introducing image-text pairs and contrastive learning into
+liveness detection tasks.
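The core training signal described above is a CLIP-style contrastive alignment between face images and the eight category descriptions. A minimal sketch of such a loss is given below; the temperature and the cross-entropy formulation over the eight category prompts are illustrative assumptions.

```python
# Hedged sketch of aligning image embeddings with eight attack/live category text embeddings.
import torch
import torch.nn.functional as F

def clip_style_loss(image_feats, text_feats, labels, temperature=0.07):
    # image_feats: (B, D); text_feats: (8, D), one embedding per category description
    image_feats = F.normalize(image_feats, dim=-1)
    text_feats = F.normalize(text_feats, dim=-1)
    logits = image_feats @ text_feats.t() / temperature   # (B, 8) similarities
    return F.cross_entropy(logits, labels)                # pull matching image-text pairs together
```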
+
+
+
+
+ + ☆ LGFCTR: Local and Global Feature Convolutional Transformer for Image + Matching + + +
+ Image matching, which aims to find robust and accurate correspondences across
+images, is a challenging task under extreme conditions. Capturing local and
+global features simultaneously is an important way to mitigate this issue, but
+recent transformer-based decoders still suffer from two problems: CNN-based
+encoders extract only local features, and the transformers lack locality.
+Inspired by the locality and implicit positional encoding of convolutions, a
+novel convolutional transformer is proposed to capture both local contexts and
+global structures more sufficiently for detector-free matching. Firstly, a
+universal FPN-like framework captures global structures in both the
+self-encoder and the cross-decoder with transformers, and compensates for local
+contexts and implicit positional encoding with convolutions. Secondly, a novel
+convolutional transformer module explores multi-scale long-range dependencies
+through a novel multi-scale attention, and further aggregates local information
+within these dependencies to enhance locality. Finally, a novel
+regression-based sub-pixel refinement module exploits the full fine-grained
+window features for fine-level positional deviation regression. The proposed
+method achieves superior performance on a wide range of benchmarks. The code
+will be available on https://github.com/zwh0527/LGFCTR.
+
+ comment: 8 pages of main text, 7 pages of supplementary material, 3 pages of + references, 6 figures in main text and 8 figures in supplementary material, 5 + tables in main text and 2 tables in supplementary material +
+
+
+
+
+ + ☆ An Efficient Illumination Invariant Tiger Detection Framework for + Wildlife Surveillance + + +
+ Tiger conservation necessitates the strategic deployment of multifaceted +initiatives encompassing the preservation of ecological habitats, anti-poaching +measures, and community involvement for sustainable growth in the tiger +population. With the advent of artificial intelligence, tiger surveillance can +be automated using object detection. In this paper, an accurate illumination +invariant framework is proposed based on EnlightenGAN and YOLOv8 for tiger +detection. The fine-tuned YOLOv8 model achieves a mAP score of 61% without +illumination enhancement. The illumination enhancement improves the mAP by +0.7%. The approaches elevate the state-of-the-art performance on the ATRW +dataset by approximately 6% to 7%. + +
+
+ comment: accepted at ICCIS 2023 +
+
+
+
+
+ + ☆ VINNA for Neonates -- Orientation Independence through Latent + Augmentations + + +
+ Fast and accurate segmentation of neonatal brain images is highly desired to +better understand and detect changes during development and disease. Yet, the +limited availability of ground truth datasets, lack of standardized acquisition +protocols, and wide variations of head positioning pose challenges for method +development. A few automated image analysis pipelines exist for newborn brain +MRI segmentation, but they often rely on time-consuming procedures and require +resampling to a common resolution, subject to loss of information due to +interpolation and down-sampling. Without registration and image resampling, +variations with respect to head positions and voxel resolutions have to be +addressed differently. In deep-learning, external augmentations are +traditionally used to artificially expand the representation of spatial +variability, increasing the training dataset size and robustness. However, +these transformations in the image space still require resampling, reducing +accuracy specifically in the context of label interpolation. We recently +introduced the concept of resolution-independence with the Voxel-size +Independent Neural Network framework, VINN. Here, we extend this concept by +additionally shifting all rigid-transforms into the network architecture with a +four degree of freedom (4-DOF) transform module, enabling resolution-aware +internal augmentations (VINNA). In this work we show that VINNA (i) +significantly outperforms state-of-the-art external augmentation approaches, +(ii) effectively addresses the head variations present specifically in newborn +datasets, and (iii) retains high segmentation accuracy across a range of +resolutions (0.5-1.0 mm). The 4-DOF transform module is a powerful, general +approach to implement spatial augmentation without requiring image or label +interpolation. The specific network application to newborns will be made +publicly available as VINNA4neonates. + +
+
+ comment: Under Review at Imaging Neuroscience +
+
+
+
+
+ + ☆ Smooth Video Synthesis with Noise Constraints on Diffusion Models for + One-shot Video Tuning + + +
+ Recent one-shot video tuning methods, which fine-tune the network on a +specific video based on pre-trained text-to-image models (e.g., Stable +Diffusion), are popular in the community because of the flexibility. However, +these methods often produce videos marred by incoherence and inconsistency. To +address these limitations, this paper introduces a simple yet effective noise +constraint across video frames. This constraint aims to regulate noise +predictions across their temporal neighbors, resulting in smooth latents. It +can be simply included as a loss term during the training phase. By applying +the loss to existing one-shot video tuning methods, we significantly improve +the overall consistency and smoothness of the generated videos. Furthermore, we +argue that current video evaluation metrics inadequately capture smoothness. To +address this, we introduce a novel metric that considers detailed features and +their temporal dynamics. Experimental results validate the effectiveness of our +approach in producing smoother videos on various one-shot video tuning +baselines. The source codes and video demos are available at +\href{https://github.com/SPengLiang/SmoothVideo}{https://github.com/SPengLiang/SmoothVideo}. + +
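Because the abstract describes the constraint as a simple loss term over temporally neighboring noise predictions, a hedged sketch of what such a term could look like is given below; the exact weighting and neighborhood definition in the paper may differ.

```python
# Hedged sketch of a temporal noise-smoothness constraint for one-shot video tuning.
import torch

def temporal_noise_constraint(eps_pred: torch.Tensor) -> torch.Tensor:
    # eps_pred: (T, C, H, W) per-frame noise predictions of the video diffusion model
    return ((eps_pred[1:] - eps_pred[:-1]) ** 2).mean()

# assumed usage during fine-tuning:
# loss = diffusion_mse_loss + lambda_smooth * temporal_noise_constraint(eps_pred)
```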
+
+
+
+
+ + ☆ Weakly-Supervised Emotion Transition Learning for Diverse 3D Co-speech + Gesture Generation + + +
+ Generating vivid and emotional 3D co-speech gestures is crucial for virtual
+avatar animation in human-machine interaction applications. While existing
+methods can generate gestures that follow a single emotion label, they overlook
+that long gesture sequence modeling with emotion transitions is more practical
+in real scenes. In addition, the lack of large-scale available datasets with
+emotional transition speech and corresponding 3D human gestures also limits
+progress on this task. To fulfill this goal, we first leverage ChatGPT-4 and an
+audio inpainting approach to construct high-fidelity emotion transition human
+speech. Since obtaining realistic 3D pose annotations corresponding to the
+dynamically inpainted emotion transition audio is extremely difficult, we
+propose a novel weakly supervised training strategy to encourage authentic
+gesture transitions. Specifically, to enhance the coordination of transition
+gestures w.r.t. different emotional ones, we model the temporal association
+representation between two different emotional gesture sequences as style
+guidance and infuse it into the transition generation. We further devise an
+emotion mixture mechanism that provides weak supervision based on a learnable
+mixed emotion label for transition gestures. Finally, we present a keyframe
+sampler to supply effective initial posture cues in long sequences, enabling us
+to generate diverse gestures. Extensive experiments demonstrate that our method
+outperforms state-of-the-art models constructed by adapting single
+emotion-conditioned counterparts on our newly defined emotion transition task
+and datasets.
+
+ comment: The code and dataset will be released as soon as possible +
+
+
+
+
+ + ☆ HiDiffusion: Unlocking High-Resolution Creativity and Efficiency in + Low-Resolution Trained Diffusion Models + + +
+ We introduce HiDiffusion, a tuning-free framework composed of
+Resolution-Aware U-Net (RAU-Net) and Modified Shifted Window Multi-head
+Self-Attention (MSW-MSA) that enables pretrained large text-to-image diffusion
+models to efficiently generate high-resolution images (e.g., 1024x1024) that
+surpass the training image resolution. Pretrained diffusion models encounter
+unreasonable object duplication when generating images beyond the training
+image resolution. We attribute this to the mismatch between the feature map
+size of high-resolution images and the receptive field of U-Net's convolutions.
+To address this issue, we propose a simple yet scalable method named RAU-Net.
+RAU-Net dynamically adjusts the feature map size to match the convolution's
+receptive field in the deep blocks of U-Net. Another obstacle in
+high-resolution synthesis is the slow inference speed of U-Net. Our
+observations reveal that the global self-attention in the top block, despite
+exhibiting locality, consumes the majority of computational resources. To
+tackle this issue, we propose MSW-MSA. Unlike previous window attention
+mechanisms, our method uses a much larger window size and dynamically shifts
+windows to better accommodate diffusion models. Extensive experiments
+demonstrate that HiDiffusion can scale diffusion models to generate 1024x1024,
+2048x2048, or even 4096x4096 resolution images, while simultaneously reducing
+inference time by 40%-60%, achieving state-of-the-art performance on
+high-resolution image synthesis. The most significant revelation of our work is
+that a pretrained diffusion model on low-resolution images is scalable for
+high-resolution generation without further tuning. We hope this revelation can
+provide insights for future research on the scalability of diffusion models.
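A rough mental model of the resolution-aware adjustment is a temporary rescaling of deep feature maps so that the convolutions operate at the feature size they were trained for. The sketch below illustrates this under assumed resolutions; the actual RAU-Net placement and scaling factors come from the paper, not from this snippet.

```python
# Hedged sketch: shrink deep U-Net features to the training-resolution regime, then restore.
import torch.nn.functional as F

def resolution_aware(feat, train_res=512, gen_res=1024):
    scale = train_res / gen_res                       # e.g. 0.5 when generating at 1024
    h, w = feat.shape[-2:]
    down = F.interpolate(feat, scale_factor=scale, mode="bilinear", align_corners=False)
    # ... run the original deep U-Net block on `down` here ...
    return F.interpolate(down, size=(h, w), mode="bilinear", align_corners=False)
```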
+
+
+
+
+ + ☆ A publicly available vessel segmentation algorithm for SLO images + + +
+ Background and Objective: Infra-red scanning laser ophthalmoscope (IRSLO) +images are akin to colour fundus photographs in displaying the posterior pole +and retinal vasculature fine detail. While there are many trained networks +readily available for retinal vessel segmentation in colour fundus photographs, +none cater to IRSLO images. Accordingly, we aimed to develop (and release as +open source) a vessel segmentation algorithm tailored specifically to IRSLO +images. Materials and Methods: We used 23 expertly annotated IRSLO images from +the RAVIR dataset, combined with 7 additional images annotated in-house. We +trained a U-Net (convolutional neural network) to label pixels as 'vessel' or +'background'. Results: On an unseen test set (4 images), our model achieved an +AUC of 0.981, and an AUPRC of 0.815. Upon thresholding, it achieved a +sensitivity of 0.844, a specificity of 0.983, and an F1 score of 0.857. +Conclusion: We have made our automatic segmentation algorithm publicly +available and easy to use. Researchers can use the generated vessel maps to +compute metrics such as fractal dimension and vessel density. + +
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ☆ Improving Stability during Upsampling -- on the Importance of Spatial + Context + + +
+ State-of-the-art models for pixel-wise prediction tasks such as image
+restoration, image segmentation, or disparity estimation involve several
+stages of data resampling, in which the resolution of feature maps is first
+reduced to aggregate information and then sequentially increased to generate a
+high-resolution output. Several previous works have investigated the artifacts
+introduced during downsampling, and diverse remedies have been proposed that
+help improve prediction stability and even robustness for image classification.
+However, the equally relevant artifacts that arise during upsampling have been
+discussed far less. This is significant because upsampling and downsampling
+face fundamentally different challenges: while aliases and artifacts during
+downsampling can be reduced by blurring feature maps, the emergence of fine
+details is crucial during upsampling. Blurring is therefore not an option, and
+dedicated operations need to be considered. In this work, we are the first to
+explore the relevance of context during upsampling by employing convolutional
+upsampling operations with increasing kernel size while keeping the encoder
+unchanged. We find that increased kernel sizes can in general improve
+prediction stability in tasks such as image restoration or image segmentation,
+while a block that combines small kernels for fine details with large kernels
+for artifact removal and increased context yields the best results.
+
+ comment: Stable upsampling, reduction in spectral artifacts +
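To make the "small kernels for detail, large kernels for context" finding concrete, here is a hedged sketch of a combined upsampling block. The specific kernel sizes (3 and 7) and the additive merge are assumptions chosen for illustration, not the paper's exact block.

```python
# Hedged sketch of an upsampling block that mixes small- and large-kernel branches.
import torch.nn as nn

class ContextAwareUpsample(nn.Module):
    def __init__(self, channels: int):
        super().__init__()
        self.up = nn.Upsample(scale_factor=2, mode="nearest")
        self.fine = nn.Conv2d(channels, channels, kernel_size=3, padding=1)     # fine details
        self.context = nn.Conv2d(channels, channels, kernel_size=7, padding=3)  # larger context

    def forward(self, x):
        x = self.up(x)
        # combine the detail-oriented and context-oriented branches
        return self.fine(x) + self.context(x)
```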
+
+
+
+
+ + ☆ The devil is in the fine-grained details: Evaluating open-vocabulary + object detectors for fine-grained understanding + + +
+ Recent advancements in large vision-language models enabled visual object +detection in open-vocabulary scenarios, where object classes are defined in +free-text formats during inference. In this paper, we aim to probe the +state-of-the-art methods for open-vocabulary object detection to determine to +what extent they understand fine-grained properties of objects and their parts. +To this end, we introduce an evaluation protocol based on dynamic vocabulary +generation to test whether models detect, discern, and assign the correct +fine-grained description to objects in the presence of hard-negative classes. +We contribute with a benchmark suite of increasing difficulty and probing +different properties like color, pattern, and material. We further enhance our +investigation by evaluating several state-of-the-art open-vocabulary object +detectors using the proposed protocol and find that most existing solutions, +which shine in standard open-vocabulary benchmarks, struggle to accurately +capture and distinguish finer object details. We conclude the paper by +highlighting the limitations of current methodologies and exploring promising +research directions to overcome the discovered drawbacks. Data and code are +available at https://github.com/lorebianchi98/FG-OVD. + +
+
+
+
+
+ + ☆ MMA-Diffusion: MultiModal Attack on Diffusion Models + + +
+ In recent years, Text-to-Image (T2I) models have seen remarkable +advancements, gaining widespread adoption. However, this progress has +inadvertently opened avenues for potential misuse, particularly in generating +inappropriate or Not-Safe-For-Work (NSFW) content. Our work introduces +MMA-Diffusion, a framework that presents a significant and realistic threat to +the security of T2I models by effectively circumventing current defensive +measures in both open-source models and commercial online services. Unlike +previous approaches, MMA-Diffusion leverages both textual and visual modalities +to bypass safeguards like prompt filters and post-hoc safety checkers, thus +exposing and highlighting the vulnerabilities in existing defense mechanisms. + +
+
+
+
+
+ + ☆ Fusion of Single and Integral Multispectral Aerial Images + + +
+ We present a novel hybrid (model- and learning-based) architecture for fusing
+the most significant features from conventional aerial images with those from
+integral aerial images that result from synthetic aperture sensing, in order to
+remove occlusion caused by dense vegetation. It combines the environment's
+spatial references with features of unoccluded targets. Our method outperforms
+the state of the art, does not require manually tuned parameters, can be
+extended to an arbitrary number and combination of spectral channels, and is
+reconfigurable to address different use cases.
+
+
+
+
+ + ☆ StructRe: Rewriting for Structured Shape Modeling + + +
+ Man-made 3D shapes are naturally organized in parts and hierarchies; such +structures provide important constraints for shape reconstruction and +generation. Modeling shape structures is difficult, because there can be +multiple hierarchies for a given shape, causing ambiguity, and across different +categories the shape structures are correlated with semantics, limiting +generalization. We present StructRe, a structure rewriting system, as a novel +approach to structured shape modeling. Given a 3D object represented by points +and components, StructRe can rewrite it upward into more concise structures, or +downward into more detailed structures; by iterating the rewriting process, +hierarchies are obtained. Such a localized rewriting process enables +probabilistic modeling of ambiguous structures and robust generalization across +object categories. We train StructRe on PartNet data and show its +generalization to cross-category and multiple object hierarchies, and test its +extension to ShapeNet. We also demonstrate the benefits of probabilistic and +generalizable structure modeling for shape reconstruction, generation and +editing tasks. + +
+
+
+
+
+ + ☆ PViT-6D: Overclocking Vision Transformers for 6D Pose Estimation with + Confidence-Level Prediction and Pose Tokens + + +
+ In the current state of 6D pose estimation, top-performing techniques depend +on complex intermediate correspondences, specialized architectures, and +non-end-to-end algorithms. In contrast, our research reframes the problem as a +straightforward regression task by exploring the capabilities of Vision +Transformers for direct 6D pose estimation through a tailored use of +classification tokens. We also introduce a simple method for determining pose +confidence, which can be readily integrated into most 6D pose estimation +frameworks. This involves modifying the transformer architecture by decreasing +the number of query elements based on the network's assessment of the scene +complexity. Our method that we call Pose Vision Transformer or PViT-6D provides +the benefits of simple implementation and being end-to-end learnable while +outperforming current state-of-the-art methods by +0.3% ADD(-S) on +Linemod-Occlusion and +2.7% ADD(-S) on the YCB-V dataset. Moreover, our method +enhances both the model's interpretability and the reliability of its +performance during inference. + +
+
+
+
+
+ + ☆ Towards Higher Ranks via Adversarial Weight Pruning NeurIPS 2023 + + +
+ Convolutional Neural Networks (CNNs) are hard to deploy on edge devices due
+to their high computation and storage complexity. As a common practice for
+model compression, network pruning consists of two major categories,
+unstructured and structured pruning, with unstructured pruning consistently
+performing better. However, unstructured pruning presents a structured pattern
+at high pruning rates, which limits its performance. To this end, we propose a
+Rank-based PruninG (RPG) method to maintain the ranks of sparse weights in an
+adversarial manner. In each step, we minimize the low-rank approximation error
+for the weight matrices using singular value decomposition, and maximize their
+distance by pushing the weight matrices away from their low-rank
+approximations. This rank-based optimization objective guides sparse weights
+towards a high-rank topology. The proposed method is conducted in a gradual
+pruning fashion to stabilize the change of rank during training. Experimental
+results on various datasets and different tasks demonstrate the effectiveness
+of our algorithm at high sparsity. The proposed RPG outperforms the
+state-of-the-art by 1.13% top-1 accuracy on ImageNet with ResNet-50 at 98%
+sparsity. The codes are available at
+https://github.com/huawei-noah/Efficient-Computing/tree/master/Pruning/RPG and
+https://gitee.com/mindspore/models/tree/master/research/cv/RPG.
+
+ comment: NeurIPS 2023 Accepted +
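A hedged sketch of the adversarial rank objective: compute the best rank-k approximation of a (masked) weight matrix with a truncated SVD and add a penalty that pushes the weights away from it. The rank k, the detach, and how the term is weighted against the task loss are assumptions, not the paper's exact recipe.

```python
# Hedged sketch of a rank-encouraging penalty on sparse weights.
import torch

def rank_penalty(weight: torch.Tensor, mask: torch.Tensor, k: int = 8) -> torch.Tensor:
    w = (weight * mask).reshape(weight.shape[0], -1)        # sparse weight as a 2-D matrix
    u, s, v = torch.svd_lowrank(w, q=k)                     # best rank-k approximation
    low_rank = (u @ torch.diag(s) @ v.t()).detach()
    # negative squared distance: adding this to the loss pushes w away from low rank
    return -((w - low_rank) ** 2).mean()

# assumed usage: loss = task_loss + lambda_rank * sum(rank_penalty(w, m) for w, m in layers)
```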
+
+
+
+
+ + ☆ Spherical Frustum Sparse Convolution Network for LiDAR Point Cloud + Semantic Segmentation + + +
+ LiDAR point cloud semantic segmentation enables the robots to obtain +fine-grained semantic information of the surrounding environment. Recently, +many works project the point cloud onto the 2D image and adopt the 2D +Convolutional Neural Networks (CNNs) or vision transformer for LiDAR point +cloud semantic segmentation. However, since more than one point can be +projected onto the same 2D position but only one point can be preserved, the +previous 2D image-based segmentation methods suffer from inevitable quantized +information loss. To avoid quantized information loss, in this paper, we +propose a novel spherical frustum structure. The points projected onto the same +2D position are preserved in the spherical frustums. Moreover, we propose a +memory-efficient hash-based representation of spherical frustums. Through the +hash-based representation, we propose the Spherical Frustum sparse Convolution +(SFC) and Frustum Fast Point Sampling (F2PS) to convolve and sample the points +stored in spherical frustums respectively. Finally, we present the Spherical +Frustum sparse Convolution Network (SFCNet) to adopt 2D CNNs for LiDAR point +cloud semantic segmentation without quantized information loss. Extensive +experiments on the SemanticKITTI and nuScenes datasets demonstrate that our +SFCNet outperforms the 2D image-based semantic segmentation methods based on +conventional spherical projection. The source code will be released later. + +
+
+ comment: 17 pages, 10 figures, under review +
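The key data structure is easy to picture: project every LiDAR point to a 2D spherical cell, but keep all points per cell in a hash map instead of overwriting them. A hedged NumPy sketch follows; the field of view, image size, and indexing convention are placeholder values, not the paper's configuration.

```python
# Hedged sketch of grouping LiDAR points into spherical-frustum cells via a hash map.
import numpy as np
from collections import defaultdict

def spherical_frustums(points, H=64, W=2048, fov_up=3.0, fov_down=-25.0):
    x, y, z = points[:, 0], points[:, 1], points[:, 2]
    r = np.linalg.norm(points[:, :3], axis=1) + 1e-8
    yaw, pitch = np.arctan2(y, x), np.arcsin(z / r)
    u = ((1 - (yaw / np.pi + 1) / 2) * W).astype(int) % W          # horizontal cell index
    fov = np.radians(fov_up - fov_down)
    v = np.clip(((np.radians(fov_up) - pitch) / fov * H).astype(int), 0, H - 1)
    frustums = defaultdict(list)          # hash: 2D cell -> indices of all points in it
    for idx, key in enumerate(v * W + u):
        frustums[int(key)].append(idx)
    return frustums
```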
+
+
+
+
+ + ☆ Non-Visible Light Data Synthesis and Application: A Case Study for + Synthetic Aperture Radar Imagery + + +
+ We explore the "hidden" ability of large-scale pre-trained image generation +models, such as Stable Diffusion and Imagen, in non-visible light domains, +taking Synthetic Aperture Radar (SAR) data for a case study. Due to the +inherent challenges in capturing satellite data, acquiring ample SAR training +samples is infeasible. For instance, for a particular category of ship in the +open sea, we can collect only few-shot SAR images which are too limited to +derive effective ship recognition models. If large-scale models pre-trained +with regular images can be adapted to generating novel SAR images, the problem +is solved. In preliminary study, we found that fine-tuning these models with +few-shot SAR images is not working, as the models can not capture the two +primary differences between SAR and regular images: structure and modality. To +address this, we propose a 2-stage low-rank adaptation method, and we call it +2LoRA. In the first stage, the model is adapted using aerial-view regular image +data (whose structure matches SAR), followed by the second stage where the base +model from the first stage is further adapted using SAR modality data. +Particularly in the second stage, we introduce a novel prototype LoRA (pLoRA), +as an improved version of 2LoRA, to resolve the class imbalance problem in SAR +datasets. For evaluation, we employ the resulting generation model to +synthesize additional SAR data. This augmentation, when integrated into the +training process of SAR classification as well as segmentation models, yields +notably improved performance for minor classes + +
+
+
+
+
+ + ☆ CLiSA: A Hierarchical Hybrid Transformer Model using Orthogonal Cross + Attention for Satellite Image Cloud Segmentation + + +
+ Clouds in optical satellite images are a major concern since their presence
+hinders the ability to carry out accurate analysis and processing. The presence
+of clouds also affects the image tasking schedule and results in wasted storage
+space on ground- as well as space-based systems. For these reasons, deriving
+accurate cloud masks from optical remote-sensing images is an important task.
+Traditional methods for cloud detection in satellite images, such as
+threshold-based and spatial-filtering approaches, suffer from a lack of
+accuracy. In recent years, deep learning algorithms have emerged as a promising
+approach to image segmentation problems, as they allow pixel-level
+classification and semantic-level segmentation. In this paper, we introduce a
+deep-learning model based on a hybrid transformer architecture for effective
+cloud mask generation, named CLiSA - Cloud segmentation via Lipschitz Stable
+Attention network. In this context, we propose a concept of orthogonal
+self-attention combined with a hierarchical cross-attention model, and we
+validate its Lipschitz stability theoretically and empirically. We design the
+whole setup under an adversarial setting in the presence of the Lov\'asz-Softmax
+loss. We demonstrate both qualitative and quantitative outcomes for multiple
+satellite image datasets including Landsat-8, Sentinel-2, and Cartosat-2s. In a
+comparative study, we show that our model performs favorably against other
+state-of-the-art methods and also provides better generalization in precise
+cloud extraction from satellite multi-spectral (MX) images. We also showcase
+different ablation studies to endorse our choices corresponding to different
+architectural elements and objective functions.
+
+
+ comment: 14 pages, 11 figures, 7 tables +
+
+
+
+
+ + ☆ Slot-Mixup with Subsampling: A Simple Regularization for WSI + Classification + + +
+ Whole slide image (WSI) classification requires repetitive zoom-in and out
+for pathologists, as only small portions of the slide may be relevant to
+detecting cancer. Due to the lack of patch-level labels, multiple instance
+learning (MIL) is a common practice for training a WSI classifier. One of the
+challenges in MIL for WSIs is the weak supervision coming only from the
+slide-level labels, often resulting in severe overfitting. In response,
+researchers have considered adopting patch-level augmentation or applying mixup
+augmentation, but their applicability remains unverified. Our approach augments
+the training dataset by sampling a subset of patches in the WSI without
+significantly altering the underlying semantics of the original slides.
+Additionally, we introduce an efficient model (Slot-MIL) that organizes patches
+into a fixed number of slots, an abstract representation of the patches, using
+an attention mechanism. We empirically demonstrate that the subsampling
+augmentation helps to make more informative slots by restricting the
+over-concentration of attention and to improve interpretability. Finally, we
+illustrate that combining our attention-based aggregation model with
+subsampling and mixup, which has shown limited compatibility in existing MIL
+methods, can enhance both generalization and calibration. Our proposed methods
+achieve state-of-the-art performance across various benchmark datasets,
+including those with class imbalance and distribution shifts.
+
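+ A sketch of the two ingredients above: attention-based pooling of patch
+features into a fixed set of slots, and patch subsampling as augmentation.
+Dimensions, slot count, and function names are assumptions, not the Slot-MIL
+code:
+
+import torch
+import torch.nn as nn
+
+class SlotAggregator(nn.Module):
+    """Pool a variable number of patch embeddings into a fixed set of slots
+    with learned queries, then classify the slide from the pooled slots."""
+    def __init__(self, dim=512, num_slots=16, num_classes=2):
+        super().__init__()
+        self.slot_queries = nn.Parameter(torch.randn(num_slots, dim) * 0.02)
+        self.attn = nn.MultiheadAttention(dim, num_heads=4, batch_first=True)
+        self.head = nn.Linear(dim, num_classes)
+
+    def forward(self, patches):                      # patches: (B, N, dim)
+        q = self.slot_queries.unsqueeze(0).expand(patches.size(0), -1, -1)
+        slots, _ = self.attn(q, patches, patches)    # (B, num_slots, dim)
+        return self.head(slots.mean(dim=1))          # slide-level logits
+
+def subsample(patches, keep_ratio=0.5):
+    """Augmentation: keep a random subset of patches from each slide."""
+    n = patches.size(1)
+    idx = torch.randperm(n)[: max(1, int(n * keep_ratio))]
+    return patches[:, idx]
+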
+
+
+
+
+ + ☆ AgentAvatar: Disentangling Planning, Driving and Rendering for + Photorealistic Avatar Agents + + +
+ In this study, our goal is to create interactive avatar agents that can
+autonomously plan and animate nuanced facial movements realistically, from both
+visual and behavioral perspectives. Given high-level inputs about the
+environment and agent profile, our framework harnesses LLMs to produce a series
+of detailed text descriptions of the avatar agents' facial motions. These
+descriptions are then processed by our task-agnostic driving engine into motion
+token sequences, which are subsequently converted into continuous motion
+embeddings that are further consumed by our standalone neural-based renderer to
+generate the final photorealistic avatar animations. These streamlined
+processes allow our framework to adapt to a variety of non-verbal avatar
+interactions, both monadic and dyadic. Our extensive study, which includes
+experiments on both newly compiled and existing datasets featuring two types of
+agents -- one capable of monadic interaction with the environment, and the
+other designed for dyadic conversation -- validates the effectiveness and
+versatility of our approach. To our knowledge, this work takes a significant
+step forward by combining LLMs and neural rendering for generalized non-verbal
+prediction and photo-realistic rendering of avatar agents.
+
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ When StyleGAN Meets Stable Diffusion: a $\mathscr{W}_+$ Adapter for + Personalized Image Generation + + +
+ Text-to-image diffusion models have remarkably excelled in producing diverse, +high-quality, and photo-realistic images. This advancement has spurred a +growing interest in incorporating specific identities into generated content. +Most current methods employ an inversion approach to embed a target visual +concept into the text embedding space using a single reference image. However, +the newly synthesized faces either closely resemble the reference image in +terms of facial attributes, such as expression, or exhibit a reduced capacity +for identity preservation. Text descriptions intended to guide the facial +attributes of the synthesized face may fall short, owing to the intricate +entanglement of identity information with identity-irrelevant facial attributes +derived from the reference image. To address these issues, we present the novel +use of the extended StyleGAN embedding space $\mathcal{W}_+$, to achieve +enhanced identity preservation and disentanglement for diffusion models. By +aligning this semantically meaningful human face latent space with +text-to-image diffusion models, we succeed in maintaining high fidelity in +identity preservation, coupled with the capacity for semantic editing. +Additionally, we propose new training objectives to balance the influences of +both prompt and identity conditions, ensuring that the identity-irrelevant +background remains unaffected during facial attribute modifications. Extensive +experiments reveal that our method adeptly generates personalized text-to-image +outputs that are not only compatible with prompt descriptions but also amenable +to common StyleGAN editing directions in diverse settings. Our source code will +be available at \url{https://github.com/csxmli2016/w-plus-adapter}. + +
+
+
+
+
+ + ☆ W-HMR: Human Mesh Recovery in World Space with Weak-supervised Camera + Calibration and Orientation Correction + + +
+ For a long time, in the field of reconstructing 3D human bodies from
+monocular images, most methods opted to simplify the task by minimizing the
+influence of the camera. Using a coarse focal length setting results in the
+reconstructed bodies not aligning well with distorted images. Ignoring camera
+rotation leads to an unrealistic reconstructed body pose in world space.
+Consequently, the application scenarios of existing methods are confined to
+controlled environments, and they struggle to achieve accurate and reasonable
+reconstruction in world space when confronted with complex and diverse
+in-the-wild images. To address the above issues, we propose W-HMR, which
+decouples global body recovery into camera calibration, local body recovery,
+and global body orientation correction. We design the first weakly-supervised
+camera calibration method for body distortion, eliminating dependence on focal
+length labels and achieving finer mesh-image alignment. We propose a novel
+orientation correction module to keep the reconstructed human body properly
+oriented in world space. Decoupling body orientation and body pose enables our
+model to consider accuracy in camera coordinates and plausibility in world
+coordinates simultaneously, expanding the range of applications. As a result,
+W-HMR achieves high-quality reconstruction in dual coordinate systems,
+particularly in challenging scenes. Code will be released on
+https://yw0208.github.io/ after publication.
+
+
+ comment: Project Page: https://yw0208.github.io +
+
+
+
+
+ + ☆ DifFlow3D: Toward Robust Uncertainty-Aware Scene Flow Estimation with + Diffusion Model + + +
+ Scene flow estimation, which aims to predict per-point 3D displacements of +dynamic scenes, is a fundamental task in the computer vision field. However, +previous works commonly suffer from unreliable correlation caused by locally +constrained searching ranges, and struggle with accumulated inaccuracy arising +from the coarse-to-fine structure. To alleviate these problems, we propose a +novel uncertainty-aware scene flow estimation network (DifFlow3D) with the +diffusion probabilistic model. Iterative diffusion-based refinement is designed +to enhance the correlation robustness and resilience to challenging cases, +e.g., dynamics, noisy inputs, repetitive patterns, etc. To restrain the +generation diversity, three key flow-related features are leveraged as +conditions in our diffusion model. Furthermore, we also develop an uncertainty +estimation module within diffusion to evaluate the reliability of estimated +scene flow. Our DifFlow3D achieves state-of-the-art performance, with 6.7\% and +19.1\% EPE3D reduction respectively on FlyingThings3D and KITTI 2015 datasets. +Notably, our method achieves an unprecedented millimeter-level accuracy +(0.0089m in EPE3D) on the KITTI dataset. Additionally, our diffusion-based +refinement paradigm can be readily integrated as a plug-and-play module into +existing scene flow networks, significantly increasing their estimation +accuracy. Codes will be released later. + +
+
+
+
+
+ + ☆ Continual Learning for Image Segmentation with Dynamic Query + + +
+ Image segmentation based on continual learning suffers a critical drop in
+performance, mainly due to catastrophic forgetting and background shift, as
+models are required to incorporate new classes continually. In this paper, we
+propose a simple, yet effective Continual Image Segmentation method with
+incremental Dynamic Query (CISDQ), which decouples the representation learning
+of both old and new knowledge with lightweight query embedding. CISDQ mainly
+includes three contributions: 1) We define dynamic queries with adaptive
+background class to exploit past knowledge and learn future classes naturally.
+2) CISDQ proposes a class/instance-aware Query Guided Knowledge Distillation
+strategy to overcome catastrophic forgetting by capturing the inter-class
+diversity and intra-class identity. 3) Apart from semantic segmentation, CISDQ
+introduces continual learning for instance segmentation, in which
+instance-wise labeling and supervision are considered. Extensive experiments on
+three datasets for two tasks (i.e., continual semantic and instance
+segmentation) are conducted to demonstrate that CISDQ achieves
+state-of-the-art performance, specifically obtaining 4.4% and 2.9% mIoU
+improvements for the ADE 100-10 (6 steps) and ADE 100-5 (11 steps) settings.
+
+
+ comment: Code: https://github.com/weijiawu/CisDQ +
+
+
+
+
+ + ☆ Weakly-semi-supervised object detection in remotely sensed imagery NeurIPS 2023 + + +
+ Deep learning for detecting objects in remotely sensed imagery can enable new +technologies for important applications including mitigating climate change. +However, these models often require large datasets labeled with bounding box +annotations which are expensive to curate, prohibiting the development of +models for new tasks and geographies. To address this challenge, we develop +weakly-semi-supervised object detection (WSSOD) models on remotely sensed +imagery which can leverage a small amount of bounding boxes together with a +large amount of point labels that are easy to acquire at scale in geospatial +data. We train WSSOD models which use large amounts of point-labeled images +with varying fractions of bounding box labeled images in FAIR1M and a wind +turbine detection dataset, and demonstrate that they substantially outperform +fully supervised models trained with the same amount of bounding box labeled +images on both datasets. Furthermore, we find that the WSSOD models trained +with 2-10x fewer bounding box labeled images can perform similarly to or +outperform fully supervised models trained on the full set of bounding-box +labeled images. We believe that the approach can be extended to other remote +sensing tasks to reduce reliance on bounding box labels and increase +development of models for impactful applications. + +
+
+ comment: Tackling Climate Change with Machine Learning at NeurIPS 2023 +
+
+
+
+
+ + ☆ MM-Narrator: Narrating Long-form Videos with Multimodal In-Context + Learning + + +
+ We present MM-Narrator, a novel system leveraging GPT-4 with multimodal +in-context learning for the generation of audio descriptions (AD). Unlike +previous methods that primarily focused on downstream fine-tuning with short +video clips, MM-Narrator excels in generating precise audio descriptions for +videos of extensive lengths, even beyond hours, in an autoregressive manner. +This capability is made possible by the proposed memory-augmented generation +process, which effectively utilizes both the short-term textual context and +long-term visual memory through an efficient register-and-recall mechanism. +These contextual memories compile pertinent past information, including +storylines and character identities, ensuring an accurate tracking and +depicting of story-coherent and character-centric audio descriptions. +Maintaining the training-free design of MM-Narrator, we further propose a +complexity-based demonstration selection strategy to largely enhance its +multi-step reasoning capability via few-shot multimodal in-context learning +(MM-ICL). Experimental results on MAD-eval dataset demonstrate that MM-Narrator +consistently outperforms both the existing fine-tuning-based approaches and +LLM-based approaches in most scenarios, as measured by standard evaluation +metrics. Additionally, we introduce the first segment-based evaluator for +recurrent text generation. Empowered by GPT-4, this evaluator comprehensively +reasons and marks AD generation performance in various extendable dimensions. + +
+
+ comment: Project page at https://mm-narrator.github.io/ +
+
+
+
+
+ + ☆ Group-wise Sparse and Explainable Adversarial Attacks + + +
+ Sparse adversarial attacks fool deep neural networks (DNNs) through minimal +pixel perturbations, typically regularized by the $\ell_0$ norm. Recent efforts +have replaced this norm with a structural sparsity regularizer, such as the +nuclear group norm, to craft group-wise sparse adversarial attacks. The +resulting perturbations are thus explainable and hold significant practical +relevance, shedding light on an even greater vulnerability of DNNs than +previously anticipated. However, crafting such attacks poses an optimization +challenge, as it involves computing norms for groups of pixels within a +non-convex objective. In this paper, we tackle this challenge by presenting an +algorithm that simultaneously generates group-wise sparse attacks within +semantically meaningful areas of an image. In each iteration, the core +operation of our algorithm involves the optimization of a quasinorm adversarial +loss. This optimization is achieved by employing the $1/2$-quasinorm proximal +operator for some iterations, a method tailored for nonconvex programming. +Subsequently, the algorithm transitions to a projected Nesterov's accelerated +gradient descent with $2$-norm regularization applied to perturbation +magnitudes. We rigorously evaluate the efficacy of our novel attack in both +targeted and non-targeted attack scenarios, on CIFAR-10 and ImageNet datasets. +When compared to state-of-the-art methods, our attack consistently results in a +remarkable increase in group-wise sparsity, e.g., an increase of $48.12\%$ on +CIFAR-10 and $40.78\%$ on ImageNet (average case, targeted attack), all while +maintaining lower perturbation magnitudes. Notably, this performance is +complemented by a significantly faster computation time and a $100\%$ attack +success rate. + +
+
+
+
+
+ + ☆ SigFormer: Sparse Signal-Guided Transformer for Multi-Modal Human Action + Segmentation + + +
+ Multi-modal human action segmentation is a critical and challenging task with
+a wide range of applications. Nowadays, the majority of approaches concentrate
+on the fusion of dense signals (i.e., RGB, optical flow, and depth maps).
+However, the potential contributions of sparse IoT sensor signals, which can be
+crucial for achieving accurate recognition, have not been fully explored. To
+make up for this, we introduce a Sparse signal-guided Transformer (SigFormer) to
+combine both dense and sparse signals. We employ mask attention to fuse
+localized features by constraining cross-attention within the regions where
+sparse signals are valid. However, since sparse signals are discrete, they lack
+sufficient information about the temporal action boundaries. Therefore, in
+SigFormer, we propose to emphasize the boundary information at two stages to
+alleviate this problem. In the first feature extraction stage, we introduce an
+intermediate bottleneck module to jointly learn both category and boundary
+features of each dense modality through the inner loss functions. After the
+fusion of dense modalities and sparse signals, we then devise a two-branch
+architecture that explicitly models the interrelationship between action
+category and temporal boundary. Experimental results demonstrate that SigFormer
+outperforms the state-of-the-art approaches on a multi-modal action
+segmentation dataset from real industrial environments, reaching an outstanding
+F1 score of 0.958. The code and pre-trained models are available at
+https://github.com/LIUQI-creat/SigFormer.
+
+
+
+
+
+ + ☆ SpeechAct: Towards Generating Whole-body Motion from Speech + + +
+ This paper addresses the problem of generating whole-body motion from speech. +Despite great successes, prior methods still struggle to produce reasonable and +diverse whole-body motions from speech. This is due to their reliance on +suboptimal representations and a lack of strategies for generating diverse +results. To address these challenges, we present a novel hybrid point +representation to achieve accurate and continuous motion generation, e.g., +avoiding foot skating, and this representation can be transformed into an +easy-to-use representation, i.e., SMPL-X body mesh, for many applications. To +generate whole-body motion from speech, for facial motion, closely tied to the +audio signal, we introduce an encoder-decoder architecture to achieve +deterministic outcomes. However, for the body and hands, which have weaker +connections to the audio signal, we aim to generate diverse yet reasonable +motions. To boost diversity in motion generation, we propose a contrastive +motion learning method to encourage the model to produce more distinctive +representations. Specifically, we design a robust VQ-VAE to learn a quantized +motion codebook using our hybrid representation. Then, we regress the motion +representation from the audio signal by a translation model employing our +contrastive motion learning method. Experimental results validate the superior +performance and the correctness of our model. The project page is available for +research purposes at http://cic.tju.edu.cn/faculty/likun/projects/SpeechAct. + +
+
+ comment: Project page: http://cic.tju.edu.cn/faculty/likun/projects/SpeechAct +
+
+
+
+
+ + ☆ Talking Head(?) Anime from a Single Image 4: Improved Model and Its + Distillation + + +
+ We study the problem of creating a character model that can be controlled in
+real time from a single image of an anime character. A solution to this problem
+would greatly reduce the cost of creating avatars, computer games, and other
+interactive applications.
+ Talking Head Anime 3 (THA3) is an open source project that attempts to
+directly address the problem. It takes as input (1) an image of an anime
+character's upper body and (2) a 45-dimensional pose vector and outputs a new
+image of the same character taking the specified pose. The range of possible
+movements is expressive enough for personal avatars and certain types of game
+characters. However, the system is too slow to generate animations in real time
+on common PCs, and its image quality can be improved.
+ In this paper, we improve THA3 in two ways. First, we propose new
+architectures for constituent networks that rotate the character's head and
+body based on U-Nets with attention that are widely used in modern generative
+models. The new architectures consistently yield better image quality than the
+THA3 baseline. Nevertheless, they also make the whole system much slower: it
+takes up to 150 milliseconds to generate a frame. Second, we propose a
+technique to distill the system into a small network (less than 2 MB) that can
+generate 512x512 animation frames in real time (under 30 FPS) using consumer
+gaming GPUs while keeping the image quality close to that of the full system.
+This improvement makes the whole system practical for real-time applications.
+
+
+
+
+
+ + ☆ Dynamic Dense Graph Convolutional Network for Skeleton-based Human + Motion Prediction + + +
+ Graph Convolutional Networks (GCNs), which typically follow a neural
+message-passing framework to model dependencies among skeletal joints, have
+achieved great success in the skeleton-based human motion prediction task.
+Nevertheless, how to construct a graph from a skeleton sequence and how to
+perform message passing on the graph are still open problems, which severely
+affect the performance of GCNs. To solve both problems, this paper presents a
+Dynamic Dense Graph Convolutional Network (DD-GCN), which constructs a dense
+graph and implements an integrated dynamic message passing. More specifically,
+we construct a dense graph with 4D adjacency modeling as a comprehensive
+representation of the motion sequence at different levels of abstraction. Based
+on the dense graph, we propose a dynamic message passing framework that learns
+dynamically from data to generate distinctive messages reflecting
+sample-specific relevance among nodes in the graph. Extensive experiments on
+the benchmark Human 3.6M and CMU Mocap datasets verify the effectiveness of our
+DD-GCN, which clearly outperforms state-of-the-art GCN-based methods,
+especially when using the long-term and our proposed extremely long-term
+protocol.
+
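+ A toy version of a message-passing step with a data-dependent adjacency, in
+the spirit of the dynamic message passing described above; the joint count,
+feature sizes, and the way the dynamic term is formed are assumptions rather
+than the DD-GCN design:
+
+import torch
+import torch.nn as nn
+
+class DynamicGraphConv(nn.Module):
+    """One message-passing step: messages are weighted by a learned static
+    adjacency plus a sample-specific term predicted from the node features."""
+    def __init__(self, num_joints: int, in_dim: int, out_dim: int):
+        super().__init__()
+        self.static_adj = nn.Parameter(torch.eye(num_joints))
+        self.to_score = nn.Linear(in_dim, in_dim)
+        self.proj = nn.Linear(in_dim, out_dim)
+
+    def forward(self, x):                                # x: (B, J, in_dim)
+        # sample-specific affinity between joints
+        dyn_adj = torch.softmax(self.to_score(x) @ x.transpose(1, 2), dim=-1)
+        adj = self.static_adj.unsqueeze(0) + dyn_adj     # (B, J, J)
+        return torch.relu(self.proj(adj @ x))            # aggregated messages
+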
+
+
+
+
+ + ☆ VITATECS: A Diagnostic Dataset for Temporal Concept Understanding of + Video-Language Models + + +
+ The ability to perceive how objects change over time is a crucial ingredient +in human intelligence. However, current benchmarks cannot faithfully reflect +the temporal understanding abilities of video-language models (VidLMs) due to +the existence of static visual shortcuts. To remedy this issue, we present +VITATECS, a diagnostic VIdeo-Text dAtaset for the evaluation of TEmporal +Concept underStanding. Specifically, we first introduce a fine-grained taxonomy +of temporal concepts in natural language in order to diagnose the capability of +VidLMs to comprehend different temporal aspects. Furthermore, to disentangle +the correlation between static and temporal information, we generate +counterfactual video descriptions that differ from the original one only in the +specified temporal aspect. We employ a semi-automatic data collection framework +using large language models and human-in-the-loop annotation to obtain +high-quality counterfactual descriptions efficiently. Evaluation of +representative video-language understanding models confirms their deficiency in +temporal understanding, revealing the need for greater emphasis on the temporal +elements in video-language research. + +
+
+ comment: 23 pages, 6 figures, 18 tables, data is available at + https://github.com/lscpku/VITATECS +
+
+
+
+
+ + ☆ Spectral and Polarization Vision: Spectro-polarimetric Real-world + Dataset + + +
+ Image datasets are essential not only in validating existing methods in +computer vision but also in developing new methods. Most existing image +datasets focus on trichromatic intensity images to mimic human vision. However, +polarization and spectrum, the wave properties of light that animals in harsh +environments and with limited brain capacity often rely on, remain +underrepresented in existing datasets. Although spectro-polarimetric datasets +exist, these datasets have insufficient object diversity, limited illumination +conditions, linear-only polarization data, and inadequate image count. Here, we +introduce two spectro-polarimetric datasets: trichromatic Stokes images and +hyperspectral Stokes images. These novel datasets encompass both linear and +circular polarization; they introduce multiple spectral channels; and they +feature a broad selection of real-world scenes. With our dataset in hand, we +analyze the spectro-polarimetric image statistics, develop efficient +representations of such high-dimensional data, and evaluate spectral dependency +of shape-from-polarization methods. As such, the proposed dataset promises a +foundation for data-driven spectro-polarimetric imaging and vision research. +Dataset and code will be publicly available. + +
+
+
+
+
+ + ☆ 360Loc: A Dataset and Benchmark for Omnidirectional Visual Localization + with Cross-device Queries + + +
+ Portable 360$^\circ$ cameras are becoming a cheap and efficient tool to +establish large visual databases. By capturing omnidirectional views of a +scene, these cameras could expedite building environment models that are +essential for visual localization. However, such an advantage is often +overlooked due to the lack of valuable datasets. This paper introduces a new +benchmark dataset, 360Loc, composed of 360$^\circ$ images with ground truth +poses for visual localization. We present a practical implementation of +360$^\circ$ mapping combining 360$^\circ$ images with lidar data to generate +the ground truth 6DoF poses. 360Loc is the first dataset and benchmark that +explores the challenge of cross-device visual positioning, involving +360$^\circ$ reference frames, and query frames from pinhole, ultra-wide FoV +fisheye, and 360$^\circ$ cameras. We propose a virtual camera approach to +generate lower-FoV query frames from 360$^\circ$ images, which ensures a fair +comparison of performance among different query types in visual localization +tasks. We also extend this virtual camera approach to feature matching-based +and pose regression-based methods to alleviate the performance loss caused by +the cross-device domain gap, and evaluate its effectiveness against +state-of-the-art baselines. We demonstrate that omnidirectional visual +localization is more robust in challenging large-scale scenes with symmetries +and repetitive structures. These results provide new insights into 360-camera +mapping and omnidirectional visual localization with cross-device queries. + +
+
+
+
+
+ + ☆ Two Scalable Approaches for Burned-Area Mapping Using U-Net and Landsat + Imagery + + +
+ Monitoring wildfires is an essential step in minimizing their impact on the
+planet, given their many negative environmental, economic, and social
+consequences. Recent advances in remote sensing technology combined with the
+increasing application of artificial intelligence methods have improved
+real-time, high-resolution fire monitoring. This study explores two proposed
+approaches based on the U-Net model for automating and optimizing the
+burned-area mapping process. Denoted 128 and AllSizes (AS), they are trained on
+datasets with different class balances, obtained by cropping input images to
+different sizes. They are then applied to Landsat imagery and time-series data
+from two fire-prone regions in Chile. The results obtained after enhancement of
+model performance by hyperparameter optimization demonstrate the effectiveness
+of both approaches. Tests based on 195 representative images of the study area
+show that increasing dataset balance using the AS model yields better
+performance. More specifically, AS exhibited a Dice Coefficient (DC) of 0.93,
+an Omission Error (OE) of 0.086, and a Commission Error (CE) of 0.045, while
+the 128 model achieved a DC of 0.86, an OE of 0.12, and a CE of 0.12. These
+findings should provide a basis for further development of scalable automatic
+burned-area mapping tools.
+
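+ For reference, the metrics quoted above are commonly computed from a binary
+prediction mask and a ground-truth mask as sketched below (an illustration, not
+the study's evaluation code):
+
+import numpy as np
+
+def burned_area_metrics(pred: np.ndarray, gt: np.ndarray, eps: float = 1e-9):
+    """pred, gt: binary masks (1 = burned). Returns Dice coefficient,
+    omission error (missed burned pixels) and commission error (false alarms)."""
+    tp = np.logical_and(pred == 1, gt == 1).sum()
+    fp = np.logical_and(pred == 1, gt == 0).sum()
+    fn = np.logical_and(pred == 0, gt == 1).sum()
+    dice = 2 * tp / (2 * tp + fp + fn + eps)
+    omission = fn / (tp + fn + eps)      # fraction of burned area missed
+    commission = fp / (tp + fp + eps)    # fraction of predictions that are wrong
+    return dice, omission, commission
+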
+
+
+
+
+ + ☆ Generative Hierarchical Temporal Transformer for Hand Action Recognition + and Motion Prediction + + +
+ We present a novel framework that concurrently tackles hand action
+recognition and 3D future hand motion prediction. While previous works focus on
+either recognition or prediction, we propose a generative Transformer VAE
+architecture to jointly capture both aspects, facilitating realistic motion
+prediction by leveraging the short-term hand motion and long-term action
+consistency observed across timestamps. To ensure faithful representation of the
+semantic dependency and different temporal granularity of hand pose and action,
+our framework is decomposed into two cascaded VAE blocks. The lower pose block
+models short-span poses, while the upper action block models long-span action.
+These are connected by a mid-level feature that represents sub-second series of
+hand poses. Our framework is trained across multiple datasets, where pose and
+action blocks are trained separately to fully utilize pose-action annotations
+of different qualities. Evaluations show that on multiple datasets, the joint
+modeling of recognition and prediction improves over separate solutions, and
+the semantic and temporal hierarchy enables long-term pose and action modeling.
+
+
+
+
+
+ + ☆ Symbol-LLM: Leverage Language Models for Symbolic System in Visual Human + Activity Reasoning NeurIPS 2023 + + +
+ Human reasoning can be understood as a cooperation between the intuitive,
+associative "System-1" and the deliberative, logical "System-2". For existing
+System-1-like methods in visual activity understanding, it is crucial to
+integrate System-2 processing to improve explainability, generalization, and
+data efficiency. One possible path of activity reasoning is building a symbolic
+system composed of symbols and rules, where one rule connects multiple symbols,
+implying human knowledge and reasoning abilities. Previous methods have made
+progress, but are hampered by limited handcrafted symbols and limited rules
+from visual-based annotations, failing to cover the complex patterns of
+activities and lacking compositional generalization. To overcome these defects,
+we propose a new symbolic system with two important ideal properties:
+broad-coverage symbols and rational rules. Collecting the massive human
+knowledge needed to instantiate this symbolic system via manual annotations is
+prohibitively expensive. Instead, we leverage the recent advancement of LLMs
+(Large Language Models) as an approximation of the two ideal properties, i.e.,
+Symbols from Large Language Models (Symbol-LLM). Then, given an image, visual
+contents are extracted and checked as symbols, and activity semantics are
+reasoned out based on rules via fuzzy logic calculation. Our method shows
+superiority in extensive activity understanding tasks. Code and data are
+available at https://mvig-rhos.com/symbol_llm.
+
+
+ comment: Accepted by NeurIPS 2023 +
+
+
+
+
+ + ☆ How does spatial structure affect psychological restoration? A method + based on Graph Neural Networks and Street View Imagery + + +
+ The Attention Restoration Theory (ART) presents a theoretical framework with
+four essential indicators (being away, extent, fascinating, and compatibility)
+for comprehending urban and natural restoration quality. However, previous
+studies relied on non-sequential data and non-spatially-dependent methods,
+which overlooks the impact of spatial structure, defined here as the positional
+relationships between scene entities, on restoration quality. Past methods
+also make it challenging to measure restoration quality on an urban scale. In
+this work, a spatially-dependent graph neural network (GNN) approach is
+proposed to reveal the relation between spatial structure and restoration
+quality on an urban scale. Specifically, we constructed two different types of
+graphs at the street and city levels. The street-level graphs, using sequential
+street view images (SVIs) of road segments to capture positional relationships
+between entities, were used to represent spatial structure. The city-level
+graph, modeling the topological relationships of roads as non-Euclidean data
+structures and embedding urban features (including Perception-features,
+Spatial-features, and Socioeconomic-features), was used to measure restoration
+quality. The results demonstrate that: 1) the spatially-dependent GNN model
+outperforms traditional methods (Acc = 0.735, F1 = 0.732); 2) spatial structure
+portrayed through sequential SVI data significantly influences restoration
+quality; 3) spaces with the same restoration quality exhibited distinct spatial
+structure patterns. This study clarifies the association between spatial
+structure and restoration quality, providing a new perspective to improve urban
+well-being in the future.
+
+
+ comment: 33 pages, 7 figures, Under review +
+
+
+
+
+ + ☆ A natural language processing-based approach: mapping human perception + by understanding deep semantic features in street view images + + +
+ In the past decade, using Street View images and machine learning to measure
+human perception has become a mainstream research approach in urban science.
+However, this approach, which uses only shallow image information, makes it
+difficult to comprehensively understand the deep semantic features of human
+perception of a scene. In this study, we proposed a new framework based on a
+pre-trained natural language model to understand the relationship between human
+perception and the sense of a scene. Firstly, Place Pulse 2.0 was used as our
+base dataset, which contains a variety of human-perceived labels, namely,
+beautiful, safe, wealthy, depressing, boring, and lively. An image captioning
+network was used to extract the description information of each street view
+image. Secondly, a pre-trained BERT model was fine-tuned with an added
+regression head for six human perceptual dimensions. Furthermore, we compared
+the performance of five traditional regression methods with our approach and
+conducted a migration experiment in Hong Kong. Our results show that human
+perception scoring with deep semantic features performs better than the
+shallow-feature machine learning methods used in previous studies. The use of
+deep scene semantic features provides new ideas for subsequent human perception
+research, as well as better explanatory power in the face of spatial
+heterogeneity.
+
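+ A minimal sketch of the second step, fine-tuning a pre-trained BERT encoder
+with a regression head over the six perceptual dimensions, assuming the Hugging
+Face transformers library; the model name, class name, and dimensions are
+illustrative assumptions:
+
+import torch.nn as nn
+from transformers import AutoModel, AutoTokenizer
+
+class PerceptionRegressor(nn.Module):
+    """BERT encoder + linear head that scores a caption on six perceptual
+    dimensions (beautiful, safe, wealthy, depressing, boring, lively)."""
+    def __init__(self, name: str = "bert-base-uncased", num_dims: int = 6):
+        super().__init__()
+        self.encoder = AutoModel.from_pretrained(name)
+        self.head = nn.Linear(self.encoder.config.hidden_size, num_dims)
+
+    def forward(self, input_ids, attention_mask):
+        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
+        cls = out.last_hidden_state[:, 0]      # [CLS] token representation
+        return self.head(cls)                  # six perception scores
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+model = PerceptionRegressor()
+batch = tokenizer(["a tree-lined street with cafes"], return_tensors="pt")
+scores = model(batch["input_ids"], batch["attention_mask"])   # shape (1, 6)
+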
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ☆ Efficient Stitchable Task Adaptation + + +
+ The paradigm of pre-training and fine-tuning has laid the foundation for +deploying deep learning models. However, most fine-tuning methods are designed +to meet a specific resource budget. Recently, considering diverse deployment +scenarios with various resource budgets, stitchable neural network (SN-Net) is +introduced to quickly obtain numerous new networks (stitches) from the +pre-trained models (anchors) in a model family via model stitching. Although +promising, SN-Net confronts new challenges when adapting it to new target +domains, including huge memory and storage requirements and a long and +sub-optimal multistage adaptation process. In this work, we present a novel +framework, Efficient Stitchable Task Adaptation (ESTA), to efficiently produce +a palette of fine-tuned models that adhere to diverse resource constraints. +Specifically, we first tailor parameter-efficient fine-tuning to share low-rank +updates among the stitches while maintaining independent bias terms. In this +way, we largely reduce fine-tuning memory burdens and mitigate the interference +among stitches that arises in task adaptation. Furthermore, we streamline a +simple yet effective one-stage deployment pipeline, which estimates the +important stitches to deploy with training-time gradient statistics. By +assigning higher sampling probabilities to important stitches, we also get a +boosted Pareto frontier. Extensive experiments on 25 downstream visual +recognition tasks demonstrate that our ESTA is capable of generating stitches +with smooth accuracy-efficiency trade-offs and surpasses the direct SN-Net +adaptation by remarkable margins with significantly lower training time and +fewer trainable parameters. Furthermore, we demonstrate the flexibility and +scalability of our ESTA framework by stitching LLMs from LLaMA family, +obtaining chatbot stitches of assorted sizes. + +
+
+ comment: Source code will be released at + https://github.com/ziplab/Stitched_LLaMA +
+
+
+
+
+ + ☆ Implicit-explicit Integrated Representations for Multi-view Video + Compression + + +
+ With the increasing consumption of 3D displays and virtual reality,
+multi-view video has become a promising format. However, its high resolution
+and multi-camera shooting result in a substantial increase in data volume,
+making storage and transmission a challenging task. To tackle these
+difficulties, we propose an implicit-explicit integrated representation for
+multi-view video compression. Specifically, we first use the explicit
+representation-based 2D video codec to encode one of the source views.
+Subsequently, we propose employing the implicit neural representation
+(INR)-based codec to encode the remaining views. The implicit codec takes the
+time and view index of multi-view video as coordinate inputs and generates the
+corresponding implicit reconstruction frames. To enhance the compressibility, we
+introduce a multi-level feature grid embedding and a fully convolutional
+architecture into the implicit codec. These components facilitate
+coordinate-feature and feature-RGB mapping, respectively. To further enhance
+the reconstruction quality from the INR codec, we leverage the high-quality
+reconstructed frames from the explicit codec to achieve inter-view
+compensation. Finally, the compensated results are fused with the implicit
+reconstructions from the INR to obtain the final reconstructed frames. Our
+proposed framework combines the strengths of both implicit neural
+representation and explicit 2D codec. Extensive experiments conducted on public
+datasets demonstrate that the proposed framework can achieve comparable or even
+superior performance to the latest multi-view video compression standard MIV
+and other INR-based schemes in terms of view compression and scene modeling.
+
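+ As a rough illustration of the implicit part described above, the sketch
+below maps a (time, view) index pair to a low-resolution frame through learned
+embeddings and a small fully convolutional decoder; all layer sizes and names
+are assumptions, and the real codec is considerably more elaborate:
+
+import torch
+import torch.nn as nn
+
+class ImplicitViewCodec(nn.Module):
+    """Tiny INR-style decoder: a (time, view) index selects embeddings that a
+    fully convolutional head upsamples into a small RGB frame."""
+    def __init__(self, num_frames: int, num_views: int, feat: int = 64):
+        super().__init__()
+        self.t_embed = nn.Embedding(num_frames, feat)
+        self.v_embed = nn.Embedding(num_views, feat)
+        self.decoder = nn.Sequential(
+            nn.ConvTranspose2d(2 * feat, feat, 4, stride=4),   # 1x1 -> 4x4
+            nn.GELU(),
+            nn.ConvTranspose2d(feat, feat, 4, stride=4),       # 4x4 -> 16x16
+            nn.GELU(),
+            nn.Conv2d(feat, 3, 3, padding=1),
+        )
+
+    def forward(self, t_idx, v_idx):
+        z = torch.cat([self.t_embed(t_idx), self.v_embed(v_idx)], dim=-1)
+        return self.decoder(z[..., None, None])                # (B, 3, 16, 16)
+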
+
+
+
+
+ + ☆ Cross-Scope Spatial-Spectral Information Aggregation for Hyperspectral + Image Super-Resolution + + +
+ Hyperspectral image super-resolution has attained widespread prominence to +enhance the spatial resolution of hyperspectral images. However, +convolution-based methods have encountered challenges in harnessing the global +spatial-spectral information. The prevailing transformer-based methods have not +adequately captured the long-range dependencies in both spectral and spatial +dimensions. To alleviate this issue, we propose a novel cross-scope +spatial-spectral Transformer (CST) to efficiently investigate long-range +spatial and spectral similarities for single hyperspectral image +super-resolution. Specifically, we devise cross-attention mechanisms in spatial +and spectral dimensions to comprehensively model the long-range +spatial-spectral characteristics. By integrating global information into the +rectangle-window self-attention, we first design a cross-scope spatial +self-attention to facilitate long-range spatial interactions. Then, by +leveraging appropriately characteristic spatial-spectral features, we construct +a cross-scope spectral self-attention to effectively capture the intrinsic +correlations among global spectral bands. Finally, we elaborate a concise +feed-forward neural network to enhance the feature representation capacity in +the Transformer structure. Extensive experiments over three hyperspectral +datasets demonstrate that the proposed CST is superior to other +state-of-the-art methods both quantitatively and visually. The code is +available at \url{https://github.com/Tomchenshi/CST.git}. + +
+
+
+
+
+ + ☆ RADAP: A Robust and Adaptive Defense Against Diverse Adversarial Patches + on Face Recognition + + +
+ Face recognition (FR) systems powered by deep learning have become widely +used in various applications. However, they are vulnerable to adversarial +attacks, especially those based on local adversarial patches that can be +physically applied to real-world objects. In this paper, we propose RADAP, a +robust and adaptive defense mechanism against diverse adversarial patches in +both closed-set and open-set FR systems. RADAP employs innovative techniques, +such as FCutout and F-patch, which use Fourier space sampling masks to improve +the occlusion robustness of the FR model and the performance of the patch +segmenter. Moreover, we introduce an edge-aware binary cross-entropy (EBCE) +loss function to enhance the accuracy of patch detection. We also present the +split and fill (SAF) strategy, which is designed to counter the vulnerability +of the patch segmenter to complete white-box adaptive attacks. We conduct +comprehensive experiments to validate the effectiveness of RADAP, which shows +significant improvements in defense performance against various adversarial +patches, while maintaining clean accuracy higher than that of the undefended +Vanilla model. + +
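+ The abstract above mentions FCutout, which samples occlusion masks in Fourier
+space. A hedged sketch of one way such a smooth random mask could be generated
+is given below; the actual RADAP procedure may differ, and all constants and
+names here are assumptions:
+
+import numpy as np
+
+def fourier_mask(h: int, w: int, keep: int = 8, coverage: float = 0.3, seed=None):
+    """Sample a smooth random mask: random coefficients on the `keep` lowest
+    spatial frequencies, inverse FFT, then threshold so roughly `coverage`
+    of the image is occluded. Illustrative only."""
+    rng = np.random.default_rng(seed)
+    spec = np.zeros((h, w), dtype=complex)
+    spec[:keep, :keep] = (rng.normal(size=(keep, keep))
+                          + 1j * rng.normal(size=(keep, keep)))
+    field = np.real(np.fft.ifft2(spec))
+    thresh = np.quantile(field, coverage)
+    return (field > thresh).astype(np.float32)   # 1 = kept pixel, 0 = occluded
+
+# occluded = image * fourier_mask(*image.shape[:2])[..., None]
+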
+
+
+
+
+ + ☆ VideoAssembler: Identity-Consistent Video Generation with Reference + Entities using Diffusion Model + + +
+ Identity-consistent video generation seeks to synthesize videos that are +guided by both textual prompts and reference images of entities. Current +approaches typically utilize cross-attention layers to integrate the appearance +of the entity, which predominantly captures semantic attributes, resulting in +compromised fidelity of entities. Moreover, these methods necessitate iterative +fine-tuning for each new entity encountered, thereby limiting their +applicability. To address these challenges, we introduce VideoAssembler, a +novel end-to-end framework for identity-consistent video generation that can +conduct inference directly when encountering new entities. VideoAssembler is +adept at producing videos that are not only flexible with respect to the input +reference entities but also responsive to textual conditions. Additionally, by +modulating the quantity of input images for the entity, VideoAssembler enables +the execution of tasks ranging from image-to-video generation to sophisticated +video editing. VideoAssembler comprises two principal components: the Reference +Entity Pyramid (REP) encoder and the Entity-Prompt Attention Fusion (EPAF) +module. The REP encoder is designed to infuse comprehensive appearance details +into the denoising stages of the stable diffusion model. Concurrently, the EPAF +module is utilized to integrate text-aligned features effectively. Furthermore, +to mitigate the challenge of scarce data, we present a methodology for the +preprocessing of training data. Our evaluation of the VideoAssembler framework +on the UCF-101, MSR-VTT, and DAVIS datasets indicates that it achieves good +performances in both quantitative and qualitative analyses (346.84 in FVD and +48.01 in IS on UCF-101). Our project page is at +https://videoassembler.github.io/videoassembler. + +
+
+
+
+
+ + ♻ ☆ SuGaR: Surface-Aligned Gaussian Splatting for Efficient 3D Mesh + Reconstruction and High-Quality Mesh Rendering + + +
+ We propose a method to allow precise and extremely fast mesh extraction from
+3D Gaussian Splatting. Gaussian Splatting has recently become very popular as
+it yields realistic rendering while being significantly faster to train than
+NeRFs. It is however challenging to extract a mesh from the millions of tiny 3D
+Gaussians, as these Gaussians tend to be unorganized after optimization and no
+method has been proposed so far. Our first key contribution is a regularization
+term that encourages the Gaussians to align well with the surface of the scene.
+We then introduce a method that exploits this alignment to extract a mesh from
+the Gaussians using Poisson reconstruction, which is fast, scalable, and
+preserves details, in contrast to the Marching Cubes algorithm usually applied
+to extract meshes from Neural SDFs. Finally, we introduce an optional
+refinement strategy that binds Gaussians to the surface of the mesh, and
+jointly optimizes these Gaussians and the mesh through Gaussian splatting
+rendering. This enables easy editing, sculpting, rigging, animating,
+compositing and relighting of the Gaussians using traditional software by
+manipulating the mesh instead of the Gaussians themselves. Retrieving such an
+editable mesh for realistic rendering is done within minutes with our method,
+compared to hours with the state-of-the-art methods on neural SDFs, while
+providing a better rendering quality. Our project page is the following:
+https://imagine.enpc.fr/~guedona/sugar/
+
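+ The mesh-extraction step above relies on Poisson surface reconstruction from
+oriented points. A sketch using Open3D's screened Poisson implementation on
+points and normals that could, for instance, be derived from the aligned
+Gaussians follows; this is not the SuGaR code, and the density-based cleanup
+threshold is an assumption:
+
+import numpy as np
+import open3d as o3d
+
+def poisson_mesh(points: np.ndarray, normals: np.ndarray, depth: int = 9):
+    """points, normals: (N, 3) arrays, e.g. Gaussian centers and estimated
+    surface normals. Returns a triangle mesh via screened Poisson reconstruction."""
+    pcd = o3d.geometry.PointCloud()
+    pcd.points = o3d.utility.Vector3dVector(points)
+    pcd.normals = o3d.utility.Vector3dVector(normals)
+    mesh, densities = o3d.geometry.TriangleMesh.create_from_point_cloud_poisson(
+        pcd, depth=depth)
+    # optionally drop low-density vertices far from the input points
+    d = np.asarray(densities)
+    mesh.remove_vertices_by_mask(d < np.quantile(d, 0.05))
+    return mesh
+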
+
+ comment: We identified a minor typographical error in Equation 6; We updated + the paper accordingly. Project Webpage: + https://imagine.enpc.fr/~guedona/sugar/ +
+
+
+
+
+ + ♻ ☆ Cross-Axis Transformer with 2D Rotary Embeddings + + +
+ Despite lagging behind their modal cousins in many respects, Vision
+Transformers have provided an interesting opportunity to bridge the gap between
+sequence modeling and image modeling. Up until now, however, vision
+transformers have largely been held back due to both computational inefficiency
+and a lack of proper handling of spatial dimensions. In this paper, we
+introduce the Cross-Axis Transformer. CAT is a model inspired by both Axial
+Transformers and Microsoft's recent Retentive Network that drastically reduces
+the number of floating point operations required to process an image, while
+simultaneously converging faster and more accurately than the Vision
+Transformers it replaces.
+
+
+ comment: 7 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ HumanNorm: Learning Normal Diffusion Model for High-quality and + Realistic 3D Human Generation + + +
+ Recent text-to-3D methods employing diffusion models have made significant +advancements in 3D human generation. However, these approaches face challenges +due to the limitations of text-to-image diffusion models, which lack an +understanding of 3D structures. Consequently, these methods struggle to achieve +high-quality human generation, resulting in smooth geometry and cartoon-like +appearances. In this paper, we propose HumanNorm, a novel approach for +high-quality and realistic 3D human generation. The main idea is to enhance the +model's 2D perception of 3D geometry by learning a normal-adapted diffusion +model and a normal-aligned diffusion model. The normal-adapted diffusion model +can generate high-fidelity normal maps corresponding to user prompts with +view-dependent and body-aware text. The normal-aligned diffusion model learns +to generate color images aligned with the normal maps, thereby transforming +physical geometry details into realistic appearance. Leveraging the proposed +normal diffusion model, we devise a progressive geometry generation strategy +and a multi-step Score Distillation Sampling (SDS) loss to enhance the +performance of 3D human generation. Comprehensive experiments substantiate +HumanNorm's ability to generate 3D humans with intricate geometry and realistic +appearances. HumanNorm outperforms existing text-to-3D methods in both geometry +and texture quality. The project page of HumanNorm is +https://humannorm.github.io/. + +
+
+ comment: The project page of HumanNorm is https://humannorm.github.io/ +
+
+
+
+
+ + ♻ ☆ A Unified Approach for Text- and Image-guided 4D Scene Generation + + +
+ Large-scale diffusion generative models are greatly simplifying image, video +and 3D asset creation from user-provided text prompts and images. However, the +challenging problem of text-to-4D dynamic 3D scene generation with diffusion +guidance remains largely unexplored. We propose Dream-in-4D, which features a +novel two-stage approach for text-to-4D synthesis, leveraging (1) 3D and 2D +diffusion guidance to effectively learn a high-quality static 3D asset in the +first stage; (2) a deformable neural radiance field that explicitly +disentangles the learned static asset from its deformation, preserving quality +during motion learning; and (3) a multi-resolution feature grid for the +deformation field with a displacement total variation loss to effectively learn +motion with video diffusion guidance in the second stage. Through a user +preference study, we demonstrate that our approach significantly advances image +and motion quality, 3D consistency and text fidelity for text-to-4D generation +compared to baseline approaches. Thanks to its motion-disentangled +representation, Dream-in-4D can also be easily adapted for controllable +generation where appearance is defined by one or multiple images, without the +need to modify the motion learning stage. Thus, our method offers, for the +first time, a unified approach for text-to-4D, image-to-4D and personalized 4D +generation tasks. + +
+
+ comment: Project page: https://research.nvidia.com/labs/nxp/dream-in-4d/ +
+
+
+
+
+ + ♻ ☆ Efficient In-Context Learning in Vision-Language Models for Egocentric + Videos + + +
+ Recent advancements in text-only large language models (LLMs) have +highlighted the benefit of in-context learning for adapting to new tasks with a +few demonstrations. However, extending in-context learning to large +vision-language models (VLMs) using a huge amount of naturalistic +vision-language data has shown limited success, particularly for egocentric +videos, due to high data collection costs. We propose a novel training method +$\mathbb{E}$fficient $\mathbb{I}$n-context $\mathbb{L}$earning on +$\mathbb{E}$gocentric $\mathbb{V}$ideos ($\mathbb{EILEV}$), which elicits +in-context learning in VLMs for egocentric videos without requiring massive, +naturalistic egocentric video datasets. $\mathbb{EILEV}$ involves architectural +and training data adaptations to allow the model to process contexts +interleaved with video clips and narrations, sampling of in-context examples +with clusters of similar verbs and nouns, use of data with skewed marginal +distributions with a long tail of infrequent verbs and nouns, as well as +homonyms and synonyms. Our evaluations show that $\mathbb{EILEV}$-trained +models outperform larger VLMs trained on a huge amount of naturalistic data in +in-context learning. Furthermore, they can generalize to not only +out-of-distribution, but also novel, rare egocentric videos and texts via +in-context learning, demonstrating potential for applications requiring +cost-effective training, and rapid post-deployment adaptability. Our code and +demo are available at \url{https://github.com/yukw777/EILEV}. + +
+
+ comment: 10 pages, LaTeX; added acknowledgments +
+
+
+
+
+ + ♻ ☆ To See is to Believe: Prompting GPT-4V for Better Visual Instruction + Tuning + + +
+ Existing visual instruction tuning methods typically prompt large language +models with textual descriptions to generate instruction-following data. +Despite the promising performance achieved, these descriptions are derived from +image annotations, which are oftentimes coarse-grained. Furthermore, the +instructions might even contradict the visual content without observing the +entire visual context. To address this challenge, we introduce a fine-grained +visual instruction dataset, LVIS-Instruct4V, which contains 220K visually +aligned and context-aware instructions produced by prompting the powerful +GPT-4V with images from LVIS. Through experimental validation and case studies, +we demonstrate that high-quality visual instructional data could improve the +performance of LLaVA-1.5, a state-of-the-art large multimodal model, across a +wide spectrum of benchmarks by clear margins. Notably, by simply replacing the +LLaVA-Instruct with our LVIS-Instruct4V, we achieve better results than LLaVA +on most challenging LMM benchmarks, e.g., LLaVA$^w$ (76.7 vs. 70.7) and MM-Vet +(40.2 vs. 35.4). We release our data and model at +https://github.com/X2FD/LVIS-INSTRUCT4V. + +
+
+ comment: technical report; work in progress
+
+
+
+
+ + ♻ ☆ Soulstyler: Using Large Language Model to Guide Image Style Transfer for + Target Object ICASSP2024 + + +
+ Image style transfer occupies an important place in both computer graphics +and computer vision. However, most current methods require reference to +stylized images and cannot individually stylize specific objects. To overcome +this limitation, we propose the "Soulstyler" framework, which allows users to +guide the stylization of specific objects in an image through simple textual +descriptions. We introduce a large language model to parse the text and +identify stylization goals and specific styles. Combined with a CLIP-based +semantic visual embedding encoder, the model understands and matches text and +image content. We also introduce a novel localized text-image block matching +loss that ensures that style transfer is performed only on specified target +objects, while non-target regions remain in their original style. Experimental +results demonstrate that our model is able to accurately perform style transfer +on target objects according to textual descriptions without affecting the style +of background regions. Our code will be available at +https://github.com/yisuanwang/Soulstyler. + +
+
+ comment: 5 pages,3 figures,ICASSP2024 +
+
+
+
+
+ + ♻ ☆ Consistent Video-to-Video Transfer Using Synthetic Dataset + + +
+ We introduce a novel and efficient approach for text-based video-to-video +editing that eliminates the need for resource-intensive per-video-per-model +finetuning. At the core of our approach is a synthetic paired video dataset +tailored for video-to-video transfer tasks. Inspired by Instruct Pix2Pix's +image transfer via editing instruction, we adapt this paradigm to the video +domain. Extending the Prompt-to-Prompt to videos, we efficiently generate +paired samples, each with an input video and its edited counterpart. Alongside +this, we introduce the Long Video Sampling Correction during sampling, ensuring +consistent long videos across batches. Our method surpasses current methods +like Tune-A-Video, heralding substantial progress in text-based video-to-video +editing and suggesting exciting avenues for further exploration and deployment. + +
+
+
+
+
+ + ♻ ☆ SelfOcc: Self-Supervised Vision-Based 3D Occupancy Prediction + + +
+ 3D occupancy prediction is an important task for the robustness of +vision-centric autonomous driving, which aims to predict whether each point is +occupied in the surrounding 3D space. Existing methods usually require 3D +occupancy labels to produce meaningful results. However, it is very laborious +to annotate the occupancy status of each voxel. In this paper, we propose +SelfOcc to explore a self-supervised way to learn 3D occupancy using only video +sequences. We first transform the images into the 3D space (e.g., bird's eye +view) to obtain 3D representation of the scene. We directly impose constraints +on the 3D representations by treating them as signed distance fields. We can +then render 2D images of previous and future frames as self-supervision signals +to learn the 3D representations. We propose an MVS-embedded strategy to +directly optimize the SDF-induced weights with multiple depth proposals. Our +SelfOcc outperforms the previous best method SceneRF by 58.7% using a single +frame as input on SemanticKITTI and is the first self-supervised work that +produces reasonable 3D occupancy for surround cameras on nuScenes. SelfOcc +produces high-quality depth and achieves state-of-the-art results on novel +depth synthesis, monocular depth estimation, and surround-view depth estimation +on the SemanticKITTI, KITTI-2015, and nuScenes, respectively. Code: +https://github.com/huang-yh/SelfOcc. + +
+
+ comment: Code is available at: https://github.com/huang-yh/SelfOcc +
+
+
+
+
+ + ♻ ☆ Towards Learning Monocular 3D Object Localization From 2D Labels using + the Physical Laws of Motion + + +
+ We present a novel method for precise 3D object localization in single images +from a single calibrated camera using only 2D labels. No expensive 3D labels +are needed. Thus, instead of using 3D labels, our model is trained with +easy-to-annotate 2D labels along with the physical knowledge of the object's +motion. Given this information, the model can infer the latent third dimension, +even though it has never seen this information during training. Our method is +evaluated on both synthetic and real-world datasets, and we are able to achieve +a mean distance error of just 6 cm in our experiments on real data. The results +indicate the method's potential as a step towards learning 3D object location +estimation, where collecting 3D data for training is not feasible. + +
+
+
+
+
+ + ♻ ☆ GC-MVSNet: Multi-View, Multi-Scale, Geometrically-Consistent Multi-View + Stereo WACV 2024 + + +
+ Traditional multi-view stereo (MVS) methods rely heavily on photometric and +geometric consistency constraints, but newer machine learning-based MVS methods +check geometric consistency across multiple source views only as a +post-processing step. In this paper, we present a novel approach that +explicitly encourages geometric consistency of reference view depth maps across +multiple source views at different scales during learning (see Fig. 1). We find +that adding this geometric consistency loss significantly accelerates learning +by explicitly penalizing geometrically inconsistent pixels, reducing the +training iteration requirements to nearly half that of other MVS methods. Our +extensive experiments show that our approach achieves a new state-of-the-art on +the DTU and BlendedMVS datasets, and competitive results on the Tanks and +Temples benchmark. To the best of our knowledge, GC-MVSNet is the first attempt +to enforce multi-view, multi-scale geometric consistency during learning. + +
+
+ comment: Accepted in WACV 2024 +
+
+
+
+
+ + ♻ ☆ A Feasibility Study on Indoor Localization and Multi-person Tracking + Using Sparsely Distributed Camera Network with Edge Computing + + +
+ Camera-based activity monitoring systems are becoming an attractive solution +for smart building applications with the advances in computer vision and edge +computing technologies. In this paper, we present a feasibility study and +systematic analysis of a camera-based indoor localization and multi-person +tracking system implemented on edge computing devices within a large indoor +space. To this end, we deployed an end-to-end edge computing pipeline that +utilizes multiple cameras to achieve localization, body orientation estimation +and tracking of multiple individuals within a large therapeutic space spanning +$1700m^2$, all while maintaining a strong focus on preserving privacy. Our +pipeline consists of 39 edge computing camera systems equipped with Tensor +Processing Units (TPUs) placed in the indoor space's ceiling. To ensure the +privacy of individuals, a real-time multi-person pose estimation algorithm runs +on the TPU of the computing camera system. This algorithm extracts poses and +bounding boxes, which are utilized for indoor localization, body orientation +estimation, and multi-person tracking. Our pipeline demonstrated an average +localization error of 1.41 meters, a multiple-object tracking accuracy score of +88.6\%, and a mean absolute body orientation error of 29\degree. These results +show that localization and tracking of individuals in a large indoor space are +feasible even under the privacy constraints. +
+
+
+
+
+ + ♻ ☆ PKU-I2IQA: An Image-to-Image Quality Assessment Database for AI + Generated Images + + +
+ As image generation technology advances, AI-based image generation has been +applied in various fields and Artificial Intelligence Generated Content (AIGC) +has garnered widespread attention. However, the development of AI-based image +generative models also brings new problems and challenges. A significant +challenge is that AI-generated images (AIGI) may exhibit unique distortions +compared to natural images, and not all generated images meet the requirements +of the real world. Therefore, it is of great significance to evaluate AIGIs +more comprehensively. Although previous work has established several human +perception-based AIGC image quality assessment (AIGCIQA) databases for +text-generated images, AI image generation technology includes scenarios +like text-to-image and image-to-image, and assessing only the images generated +by text-to-image models is insufficient. To address this issue, we establish a +human perception-based image-to-image AIGCIQA database, named PKU-I2IQA. We +conduct a well-organized subjective experiment to collect quality labels for +AIGIs and then conduct a comprehensive analysis of the PKU-I2IQA database. +Furthermore, we propose two benchmark models: NR-AIGCIQA based on the +no-reference image quality assessment method and FR-AIGCIQA based on the +full-reference image quality assessment method. Finally, leveraging this +database, we conduct benchmark experiments and compare the performance of the +proposed benchmark models. The PKU-I2IQA database and benchmarks will be +released at \url{https://github.com/jiquan123/I2IQA} to facilitate future +research. +
+
+ comment: 18 pages +
+
+
+
+
+ + ♻ ☆ Natural & Adversarial Bokeh Rendering via Circle-of-Confusion Predictive + Network + + +
+ Bokeh effect is a natural shallow depth-of-field phenomenon that blurs the +out-of-focus part in photography. In recent years, a series of works have +proposed automatic and realistic bokeh rendering methods for artistic and +aesthetic purposes. They usually employ cutting-edge data-driven deep +generative networks with complex training strategies and network architectures. +However, these works neglect that the bokeh effect, as a real phenomenon, can +inevitably affect the subsequent visual intelligent tasks like recognition, and +their data-driven nature prevents them from studying the influence of +bokeh-related physical parameters (i.e., depth-of-the-field) on the intelligent +tasks. To fill this gap, we study a totally new problem, i.e., natural & +adversarial bokeh rendering, which consists of two objectives: rendering +realistic and natural bokeh and fooling the visual perception models (i.e., +bokeh-based adversarial attack). To this end, beyond the pure data-driven +solution, we propose a hybrid alternative by taking the respective advantages +of data-driven and physical-aware methods. Specifically, we propose the +circle-of-confusion predictive network (CoCNet) by taking the all-in-focus +image and depth image as inputs to estimate circle-of-confusion parameters for +each pixel, which are employed to render the final image through a well-known +physical model of bokeh. With the hybrid solution, our method could achieve +more realistic rendering results with the naive training strategy and a much +lighter network. + +
+
+ comment: 11 pages, accepted by TMM +
+
+
+
+
+ + ♻ ☆ CADS: Unleashing the Diversity of Diffusion Models through + Condition-Annealed Sampling + + +
+ While conditional diffusion models are known to have good coverage of the +data distribution, they still face limitations in output diversity, +particularly when sampled with a high classifier-free guidance scale for +optimal image quality or when trained on small datasets. We attribute this +problem to the role of the conditioning signal in inference and offer an +improved sampling strategy for diffusion models that can increase generation +diversity, especially at high guidance scales, with minimal loss of sample +quality. Our sampling strategy anneals the conditioning signal by adding +scheduled, monotonically decreasing Gaussian noise to the conditioning vector +during inference to balance diversity and condition alignment. Our +Condition-Annealed Diffusion Sampler (CADS) can be used with any pretrained +model and sampling algorithm, and we show that it boosts the diversity of +diffusion models in various conditional generation tasks. Further, using an +existing pretrained diffusion model, CADS achieves a new state-of-the-art FID +of 1.70 and 2.31 for class-conditional ImageNet generation at 256$\times$256 +and 512$\times$512 respectively. + +
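As a rough illustration of the condition-annealing idea described above, the sketch below adds scheduled, monotonically decreasing Gaussian noise to a conditioning vector during sampling. The schedule shape, cutoff values, and the optional rescaling step are assumptions for illustration, not the paper's exact settings.

import torch

def anneal_condition(cond, t, tau1=0.6, tau2=0.9, noise_scale=0.25, rescale=True):
    # t runs from 1.0 (start of sampling) down to 0.0 (end of sampling).
    # gamma rises from 0 to 1 once t drops below tau2, so early steps see a
    # heavily noised condition (more diversity) and late steps see the clean one.
    gamma = min(max((tau2 - t) / (tau2 - tau1), 0.0), 1.0)
    noised = gamma ** 0.5 * cond + noise_scale * (1.0 - gamma) ** 0.5 * torch.randn_like(cond)
    if rescale:
        # Keep the noised condition roughly at the original mean/std so the
        # denoiser still receives in-distribution conditioning statistics.
        noised = (noised - noised.mean()) / (noised.std() + 1e-8)
        noised = noised * cond.std() + cond.mean()
    return noised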
+
+
+
+
+ + ♻ ☆ TCDM: Transformational Complexity Based Distortion Metric for Perceptual + Point Cloud Quality Assessment + + +
+ The goal of objective point cloud quality assessment (PCQA) research is to +develop quantitative metrics that measure point cloud quality in a perceptually +consistent manner. Merging the research of cognitive science and intuition of +the human visual system (HVS), in this paper, we evaluate the point cloud +quality by measuring the complexity of transforming the distorted point cloud +back to its reference, which in practice can be approximated by the code length +of one point cloud when the other is given. For this purpose, we first make +space segmentation for the reference and distorted point clouds based on a 3D +Voronoi diagram to obtain a series of local patch pairs. Next, inspired by the +predictive coding theory, we utilize a space-aware vector autoregressive +(SA-VAR) model to encode the geometry and color channels of each reference +patch with and without the distorted patch, respectively. Assuming that the +residual errors follow the multi-variate Gaussian distributions, the +self-complexity of the reference and transformational complexity between the +reference and distorted samples are computed using covariance matrices. +Additionally, the prediction terms generated by SA-VAR are introduced as one +auxiliary feature to promote the final quality prediction. The effectiveness of +the proposed transformational complexity based distortion metric (TCDM) is +evaluated through extensive experiments conducted on five public point cloud +quality assessment databases. The results demonstrate that TCDM achieves +state-of-the-art (SOTA) performance, and further analysis confirms its +robustness in various scenarios. The code is publicly available at +https://github.com/zyj1318053/TCDM. + +
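A hedged sketch of the complexity proxy implied by the Gaussian-residual assumption above: if the regression residuals of a patch have covariance Sigma, their per-sample code length can be approximated by the Gaussian differential entropy 0.5 * log det(2*pi*e*Sigma). The function name and the regularization term are illustrative; the paper's exact formulation may differ.

import numpy as np

def gaussian_code_length(residuals, eps=1e-6):
    """Approximate code length (in nats per sample) of regression residuals
    of shape (N, D) under a multivariate Gaussian assumption."""
    residuals = np.asarray(residuals, dtype=np.float64)
    sigma = np.cov(residuals, rowvar=False) + eps * np.eye(residuals.shape[1])
    _, logdet = np.linalg.slogdet(2.0 * np.pi * np.e * sigma)
    return 0.5 * logdet

# Self-complexity: residuals from encoding the reference patch on its own.
# Transformational complexity: residuals from encoding the reference patch
# given the distorted patch; the quality score compares the two quantities.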
+
+
+
+
+ + ♻ ☆ Unified-modal Salient Object Detection via Adaptive Prompt Learning + + +
+ Existing single-modal and multi-modal salient object detection (SOD) methods +focus on designing specific architectures tailored for their respective tasks. +However, developing completely different models for different tasks leads to +labor and time consumption, as well as high computational and practical +deployment costs. In this paper, we make the first attempt to address both +single-modal and multi-modal SOD in a unified framework called UniSOD. +Nevertheless, assigning appropriate strategies to modality variable inputs is +challenging. To this end, UniSOD learns modality-aware prompts with +task-specific hints through adaptive prompt learning, which are plugged into +the proposed pre-trained baseline SOD model to handle corresponding tasks, +while only requiring few learnable parameters compared to training the entire +model. Each modality-aware prompt is generated from a switchable prompt +generation block, which performs structural switching solely relied on +single-modal and multi-modal inputs. UniSOD achieves consistent performance +improvement on 14 benchmark datasets for RGB, RGB-D, and RGB-T SOD, which +demonstrates that our method effectively and efficiently unifies single-modal +and multi-modal SOD tasks. + +
+
+ comment: 15 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ FairTune: Optimizing Parameter Efficient Fine Tuning for Fairness in + Medical Image Analysis + + +
+ Training models with robust group fairness properties is crucial in ethically +sensitive application areas such as medical diagnosis. Despite the growing body +of work aiming to minimise demographic bias in AI, this problem remains +challenging. A key reason for this challenge is the fairness generalisation +gap: High-capacity deep learning models can fit all training data nearly +perfectly, and thus also exhibit perfect fairness during training. In this +case, bias emerges only during testing when generalisation performance differs +across subgroups. This motivates us to take a bi-level optimisation perspective +on fair learning: Optimising the learning strategy based on validation +fairness. Specifically, we consider the highly effective workflow of adapting +pre-trained models to downstream medical imaging tasks using +parameter-efficient fine-tuning (PEFT) techniques. There is a trade-off between +updating more parameters, enabling a better fit to the task of interest vs. +fewer parameters, potentially reducing the generalisation gap. To manage this +tradeoff, we propose FairTune, a framework to optimise the choice of PEFT +parameters with respect to fairness. We demonstrate empirically that FairTune +leads to improved fairness on a range of medical imaging datasets. + +
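A minimal sketch of the bi-level idea described above: an outer loop searches over which PEFT parameter groups to unfreeze, scoring each candidate by validation fairness. The helpers `finetune_with_mask` and `evaluate` are assumed wrappers (not part of any library), and the simple accuracy-minus-gap score stands in for the paper's objective and its hyperparameter-optimization procedure.

def fairtune_search(candidate_masks, finetune_with_mask, evaluate, fairness_weight=1.0):
    """Pick the PEFT configuration (which parameters to update) that best trades
    off validation accuracy against the gap between demographic subgroups."""
    best_score, best_mask, best_model = float("-inf"), None, None
    for mask in candidate_masks:                   # e.g., per-layer on/off choices
        model = finetune_with_mask(mask)           # inner loop: fine-tune only masked params
        accuracy, subgroup_gap = evaluate(model)   # measured on a held-out validation split
        score = accuracy - fairness_weight * subgroup_gap
        if score > best_score:
            best_score, best_mask, best_model = score, mask, model
    return best_mask, best_model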
+
+ comment: 9 pages, 2 tables, 4 figures +
+
+
+
+
+ + ♻ ☆ RSPrompter: Learning to Prompt for Remote Sensing Instance Segmentation + based on Visual Foundation Model + + +
+ Leveraging the extensive training data from SA-1B, the Segment Anything Model +(SAM) demonstrates remarkable generalization and zero-shot capabilities. +However, as a category-agnostic instance segmentation method, SAM heavily +relies on prior manual guidance, including points, boxes, and coarse-grained +masks. Furthermore, its performance in remote sensing image segmentation tasks +remains largely unexplored and unproven. In this paper, we aim to develop an +automated instance segmentation approach for remote sensing images, based on +the foundational SAM model and incorporating semantic category information. +Drawing inspiration from prompt learning, we propose a method to learn the +generation of appropriate prompts for SAM. This enables SAM to produce +semantically discernible segmentation results for remote sensing images, a +concept we have termed RSPrompter. We also propose several ongoing derivatives +for instance segmentation tasks, drawing on recent advancements within the SAM +community, and compare their performance with RSPrompter. Extensive +experimental results, derived from the WHU building, NWPU VHR-10, and SSDD +datasets, validate the effectiveness of our proposed method. The code for our +method is publicly available at kychen.me/RSPrompter. + +
+
+
+
+
+ + ♻ ☆ Transform, Contrast and Tell: Coherent Entity-Aware Multi-Image + Captioning + + +
+ Coherent entity-aware multi-image captioning aims to generate coherent +captions for neighboring images in a news document. There are coherence +relationships among neighboring images because they often describe the same +entities or events. These relationships are important for entity-aware +multi-image captioning, but are neglected in entity-aware single-image +captioning. Most existing work focuses on single-image captioning, while +multi-image captioning has not been explored before. Hence, this paper proposes +a coherent entity-aware multi-image captioning model by making use of coherence +relationships. The model consists of a Transformer-based caption generation +model and two types of contrastive learning-based coherence mechanisms. The +generation model generates the caption by paying attention to the image and the +accompanying text. The caption-caption coherence mechanism encourages entities +in the caption of an image to also appear in the captions of neighboring images. +The caption-image-text coherence mechanism encourages entities in the caption of +an image to also appear in the accompanying text. To evaluate coherence +between captions, two coherence evaluation metrics are proposed. We also +construct a new dataset, DM800K, which has more images per document than the two +existing datasets GoodNews and NYT800K and is more suitable for multi-image +captioning. Experiments on three datasets show the proposed captioning model +outperforms 7 baselines according to BLEU, ROUGE, METEOR, and entity precision +and recall scores. Experiments also show that the generated captions are more +coherent than those of the baselines according to caption entity scores, caption +ROUGE scores, the two proposed coherence evaluation metrics, and human +evaluations. +
+
+ comment: 32 pages, 11 tables, 3 figures +
+
+
+
+
+ + ♻ ☆ Paragraph-to-Image Generation with Information-Enriched Diffusion Model + + +
+ Text-to-image (T2I) models have recently experienced rapid development, +achieving astonishing performance in terms of fidelity and textual alignment +capabilities. However, given a long paragraph (up to 512 words), these +generation models still struggle to achieve strong alignment and are unable to +generate images depicting complex scenes. In this paper, we introduce an +information-enriched diffusion model for the paragraph-to-image generation task, +termed ParaDiffusion, which delves into the transference of the extensive +semantic comprehension capabilities of large language models to the task of +image generation. At its core is using a large language model (e.g., Llama V2) +to encode long-form text, followed by fine-tuning with LoRA to align the +text-image feature spaces in the generation task. To facilitate the training of +long-text semantic alignment, we also curated a high-quality paragraph-image +pair dataset, namely ParaImage. This dataset contains a small amount of +high-quality, meticulously annotated data, and a large-scale synthetic dataset +with long text descriptions generated using a vision-language model. +Experiments demonstrate that ParaDiffusion outperforms state-of-the-art models +(SD XL, DeepFloyd IF) on ViLG-300 and ParaPrompts, achieving up to 15% and 45% +human voting rate improvements for visual appeal and text faithfulness, +respectively. The code and dataset will be released to foster community +research on long-text alignment. +
+
+ comment: The project website is at: + https://weijiawu.github.io/ParaDiffusionPage/. Code: + https://github.com/weijiawu/ParaDiffusion +
+
+
+
+
+ + ♻ ☆ Foundation Model for Endoscopy Video Analysis via Large-scale + Self-supervised Pre-train MICCAI 2023 + + +
+ Foundation models have exhibited remarkable success in various applications, +such as disease diagnosis and text report generation. To date, a foundation +model for endoscopic video analysis is still lacking. In this paper, we propose +Endo-FM, a foundation model specifically developed using massive endoscopic +video data. First, we build a video transformer, which captures both local and +global long-range dependencies across spatial and temporal dimensions. Second, +we pre-train our transformer model using global and local views via a +self-supervised manner, aiming to make it robust to spatial-temporal variations +and discriminative across different scenes. To develop the foundation model, we +construct a large-scale endoscopy video dataset by combining 9 publicly +available datasets and a privately collected dataset from Baoshan Branch of +Renji Hospital in Shanghai, China. Our dataset overall consists of over 33K +video clips with up to 5 million frames, encompassing various protocols, target +organs, and disease types. Our pre-trained Endo-FM can be easily adopted for a +given downstream task via fine-tuning by serving as the backbone. With +experiments on 3 different types of downstream tasks, including classification, +segmentation, and detection, our Endo-FM surpasses the current state-of-the-art +(SOTA) self-supervised pre-training and adapter-based transfer learning methods +by a significant margin, such as VCL (3.1% F1, 4.8% Dice, and 5.5% F1 for +classification, segmentation, and detection) and ST-Adapter (5.9% F1, 9.6% +Dice, and 9.9% F1 for classification, segmentation, and detection). Code, +datasets, and models are released at https://github.com/med-air/Endo-FM. + +
+
+ comment: MICCAI 2023 camera-ready version +
+
+
+
+
+ + ♻ ☆ LiveNVS: Neural View Synthesis on Live RGB-D Streams SIGGRAPH + + +
+ Existing real-time RGB-D reconstruction approaches, like Kinect Fusion, lack +real-time photo-realistic visualization. This is due to noisy, oversmoothed or +incomplete geometry and blurry textures which are fused from imperfect depth +maps and camera poses. Recent neural rendering methods can overcome many of +such artifacts but are mostly optimized for offline usage, hindering the +integration into a live reconstruction pipeline. + In this paper, we present LiveNVS, a system that allows for neural novel view +synthesis on a live RGB-D input stream with very low latency and real-time +rendering. Based on the RGB-D input stream, novel views are rendered by +projecting neural features into the target view via a densely fused depth map +and aggregating the features in image-space to a target feature map. A +generalizable neural network then translates the target feature map into a +high-quality RGB image. LiveNVS achieves state-of-the-art neural rendering +quality of unknown scenes during capturing, allowing users to virtually explore +the scene and assess reconstruction quality in real-time. + +
+
+ comment: main paper: 8 pages, total number of pages: 15, 13 figures, to be + published in SIGGRAPH Asia 2023 Conference Papers; edits: link was fixed +
+
+
+
+
+ + ♻ ☆ Distill Gold from Massive Ores: Efficient Dataset Distillation via + Critical Samples Selection + + +
+ Data-efficient learning has garnered significant attention, especially given +the current trend of large multi-modal models. Recently, dataset distillation +becomes an effective approach for data-efficiency; however, the distillation +process itself can still be inefficient. In this work, we model the dataset +distillation task within the context of information transport. By observing the +substantial data redundancy inherent in the distillation, we argue to put more +emphasis on the samples' utility for the distillation task. We introduce and +validate a family of data utility estimators and optimal data selection methods +to exploit the most valuable samples. This strategy significantly reduces the +training costs and extends various existing distillation algorithms to larger +and more diversified datasets, e.g., in some cases only 0.04% training data is +sufficient for comparable distillation performance. Our method consistently +enhances the distillation algorithms, even on much larger-scale and more +heterogeneous datasets, e.g. ImageNet-1K and Kinetics-400. This paradigm opens +up new avenues in the dynamics of distillation and paves the way for efficient +dataset distillation. Our code is available on +https://github.com/silicx/GoldFromOres . + +
+
+
+
+
+ + ♻ ☆ Training-Free Layout Control with Cross-Attention Guidance WACV 2024 + + +
+ Recent diffusion-based generators can produce high-quality images from +textual prompts. However, they often disregard textual instructions that +specify the spatial layout of the composition. We propose a simple approach +that achieves robust layout control without the need for training or +fine-tuning of the image generator. Our technique manipulates the +cross-attention layers that the model uses to interface textual and visual +information and steers the generation in the desired direction given, e.g., a +user-specified layout. To determine how to best guide attention, we study the +role of attention maps and explore two alternative strategies, forward and +backward guidance. We thoroughly evaluate our approach on three benchmarks and +provide several qualitative examples and a comparative analysis of the two +strategies that demonstrate the superiority of backward guidance compared to +forward guidance, as well as prior work. We further demonstrate the versatility +of layout guidance by extending it to applications such as editing the layout +and context of real images. + +
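To make the backward-guidance idea above more concrete, here is a hedged sketch of a layout energy defined over cross-attention maps: it rewards attention mass for a given prompt token falling inside its user-specified region, and its gradient would be used to nudge the latent before each denoising step. The tensor shapes and the exact energy are assumptions for illustration, not the paper's definition.

import torch

def layout_energy(attn, token_idx, region_mask, eps=1e-8):
    """attn:        (heads, H*W, num_tokens) cross-attention probabilities of one layer
    token_idx:   index of the prompt token tied to the region
    region_mask: (H*W,) binary mask of the desired layout region"""
    token_attn = attn[..., token_idx]                    # (heads, H*W)
    inside = (token_attn * region_mask).sum(-1)
    ratio = inside / (token_attn.sum(-1) + eps)          # fraction of attention inside the region
    return (1.0 - ratio).mean()                          # zero when all attention lies in the region

# Backward guidance (sketch): at each sampling step, compute this energy from the
# current cross-attention maps, backpropagate it to the latent, and take a small
# gradient step on the latent before running the usual denoising update.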
+
+ comment: WACV 2024, Project Page: + https://silent-chen.github.io/layout-guidance/ +
+
+
+
+
+ + ♻ ☆ CD-GAN: a robust fusion-based generative adversarial network for + unsupervised remote sensing change detection with heterogeneous sensors + + +
+ In the context of Earth observation, change detection boils down to comparing +images acquired at different times by sensors of possibly different spatial +and/or spectral resolutions or different modalities (e.g., optical or radar). +Even when considering only optical images, this task has proven to be +challenging as soon as the sensors differ by their spatial and/or spectral +resolutions. This paper proposes a novel unsupervised change detection method +dedicated to images acquired by such so-called heterogeneous optical sensors. +It capitalizes on recent advances which formulate the change detection task +into a robust fusion framework. Adopting this formulation, the work reported in +this paper shows that any off-the-shelf network trained beforehand to fuse +optical images of different spatial and/or spectral resolutions can be easily +complemented with a network of the same architecture and embedded into an +adversarial framework to perform change detection. A comparison with +state-of-the-art change detection methods demonstrates the versatility and the +effectiveness of the proposed approach. + +
+
+
+
+
+ + ♻ ☆ Knockoffs-SPR: Clean Sample Selection in Learning with Noisy Labels + + +
+ A noisy training set usually leads to the degradation of the generalization +and robustness of neural networks. In this paper, we propose a novel +theoretically guaranteed clean sample selection framework for learning with +noisy labels. Specifically, we first present a Scalable Penalized Regression +(SPR) method, to model the linear relation between network features and one-hot +labels. In SPR, the clean data are identified by the zero mean-shift parameters +solved in the regression model. We theoretically show that SPR can recover +clean data under some conditions. Under general scenarios, the conditions may +be no longer satisfied; and some noisy data are falsely selected as clean data. +To solve this problem, we propose a data-adaptive method for Scalable Penalized +Regression with Knockoff filters (Knockoffs-SPR), which is provable to control +the False-Selection-Rate (FSR) in the selected clean data. To improve the +efficiency, we further present a split algorithm that divides the whole +training set into small pieces that can be solved in parallel to make the +framework scalable to large datasets. While Knockoffs-SPR can be regarded as a +sample selection module for a standard supervised training pipeline, we further +combine it with a semi-supervised algorithm to exploit the support of noisy +data as unlabeled data. Experimental results on several benchmark datasets and +real-world noisy datasets show the effectiveness of our framework and validate +the theoretical results of Knockoffs-SPR. Our code and pre-trained models are +available at https://github.com/Yikai-Wang/Knockoffs-SPR. + +
+
+ comment: update: final version, to appear in TPAMI +
+
+
+
+
+ + ♻ ☆ Unsupervised approaches based on optimal transport and convex analysis + for inverse problems in imaging + + +
+ Unsupervised deep learning approaches have recently become one of the crucial +research areas in imaging owing to their ability to learn expressive and +powerful reconstruction operators even when paired high-quality training data +is scarcely available. In this chapter, we review theoretically principled +unsupervised learning schemes for solving imaging inverse problems, with a +particular focus on methods rooted in optimal transport and convex analysis. We +begin by reviewing the optimal transport-based unsupervised approaches such as +the cycle-consistency-based models and learned adversarial regularization +methods, which have clear probabilistic interpretations. Subsequently, we give +an overview of a recent line of works on provably convergent learned +optimization algorithms applied to accelerate the solution of imaging inverse +problems, alongside their dedicated unsupervised training schemes. We also +survey a number of provably convergent plug-and-play algorithms (based on +gradient-step deep denoisers), which are among the most important and widely +applied unsupervised approaches for imaging problems. At the end of this +survey, we provide an overview of a few related unsupervised learning +frameworks that complement our focused schemes. Together with a detailed +survey, we provide an overview of the key mathematical results that underlie +the methods reviewed in the chapter to keep our discussion self-contained. + +
+
+
+
+
+ + ♻ ☆ SPColor: Semantic Prior Guided Exemplar-based Image Colorization + + +
+ Exemplar-based image colorization aims to colorize a target grayscale image +based on a color reference image, and the key is to establish accurate +pixel-level semantic correspondence between these two images. Previous methods +search for correspondence across the entire reference image, and this type of +global matching is prone to mismatches. We summarize the difficulties in two +aspects: (1) When the reference image only contains a part of the objects related +to the target image, improper correspondence will be established in unrelated +regions. (2) Mismatches easily occur in regions where the shape or texture +of the object is easily confused. To overcome these issues, we propose SPColor, +a semantic prior guided exemplar-based image colorization framework. Different +from previous methods, SPColor first coarsely classifies pixels of the +reference and target images into several pseudo-classes under the guidance of a +semantic prior, then the correspondences are only established locally between +the pixels in the same class via the newly designed semantic prior guided +correspondence network. In this way, improper correspondence between different +semantic classes is explicitly excluded, and the mismatch is clearly +alleviated. Besides, to better preserve the color from the reference, a +similarity-masked perceptual loss is designed. Note that SPColor utilizes the +semantic prior provided by an unsupervised segmentation model, so no additional +manual semantic annotations are required. Experiments demonstrate that our model +outperforms recent state-of-the-art methods both quantitatively and +qualitatively on public datasets. +
+
+
+
+
+ + ♻ ☆ Hausdorff Distance Matching with Adaptive Query Denoising for Rotated + Detection Transformer + + +
+ The Detection Transformer (DETR) has emerged as a pivotal role in object +detection tasks, setting new performance benchmarks due to its end-to-end +design and scalability. Despite its advancements, the application of DETR in +detecting rotated objects has demonstrated suboptimal performance relative to +established oriented object detectors. Our analysis identifies a key +limitation: the L1 cost used in Hungarian Matching leads to duplicate +predictions due to the square-like problem in oriented object detection, +thereby obstructing the training process of the detector. We introduce a +Hausdorff distance-based cost for Hungarian matching, which more accurately +quantifies the discrepancy between predictions and ground truths. Moreover, we +note that a static denoising approach hampers the training of rotated DETR, +particularly when the detector's predictions surpass the quality of noised +ground truths. We propose an adaptive query denoising technique, employing +Hungarian matching to selectively filter out superfluous noised queries that no +longer contribute to model improvement. Our proposed modifications to DETR have +resulted in superior performance, surpassing previous rotated DETR models and +other alternatives. This is evidenced by our model's state-of-the-art +achievements in benchmarks such as DOTA-v1.0/v1.5/v2.0, and DIOR-R. + +
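The sketch below illustrates a Hausdorff-style matching cost between a predicted and a ground-truth rotated box, using point sets (e.g., the four corners or points sampled along the edges). Unlike an L1 cost on box parameters, it is insensitive to the ambiguous parameterizations of square-like boxes. Representing boxes by corner points is an assumption for illustration.

import numpy as np
from scipy.spatial.distance import directed_hausdorff

def hausdorff_box_cost(pred_points, gt_points):
    """Symmetric Hausdorff distance between two point sets of shape (N, 2),
    usable as the pairwise cost inside Hungarian matching."""
    d_pg = directed_hausdorff(pred_points, gt_points)[0]
    d_gp = directed_hausdorff(gt_points, pred_points)[0]
    return max(d_pg, d_gp)

# Two parameterizations of the same square give a zero Hausdorff cost even
# though their (cx, cy, w, h, angle) L1 distance could be large.
square = np.array([[0, 0], [1, 0], [1, 1], [0, 1]], dtype=float)
reordered = np.roll(square, shift=1, axis=0)     # same corners, different listing order
print(hausdorff_box_cost(square, reordered))     # 0.0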
+
+ comment: Under review, 16 pages, 12 tables, 8 figures +
+
+
+
+
+ + ♻ ☆ ZoomNeXt: A Unified Collaborative Pyramid Network for Camouflaged Object + Detection + + +
+ Recent camouflaged object detection (COD) attempts to segment objects +visually blended into their surroundings, which is extremely complex and +difficult in real-world scenarios. Apart from the high intrinsic similarity +between camouflaged objects and their background, objects are usually diverse +in scale, fuzzy in appearance, and even severely occluded. To this end, we +propose an effective unified collaborative pyramid network which mimics human +behavior when observing vague images and videos, \textit{i.e.}, zooming in and +out. Specifically, our approach employs the zooming strategy to learn +discriminative mixed-scale semantics by the multi-head scale integration and +rich granularity perception units, which are designed to fully explore +imperceptible clues between candidate objects and background surroundings. The +former's intrinsic multi-head aggregation provides more diverse visual +patterns. The latter's routing mechanism can effectively propagate inter-frame +differences in spatiotemporal scenarios and adaptively ignore static +representations. Together, they provide a solid foundation for realizing a unified +architecture for static and dynamic COD. Moreover, considering the uncertainty +and ambiguity derived from indistinguishable textures, we construct a simple +yet effective regularization, the uncertainty awareness loss, to encourage +predictions with higher confidence in candidate regions. Our highly +task-friendly framework consistently outperforms existing state-of-the-art +methods in image and video COD benchmarks. The code will be available at +\url{https://github.com/lartpang/ZoomNeXt}. +
+
+ comment: Extensions to the conference version: arXiv:2203.02688; Fixed some + word errors +
+
+
+
+
+ + ♻ ☆ Effective Quantization for Diffusion Models on CPUs + + +
+ Diffusion models have gained popularity for generating images from textual +descriptions. Nonetheless, the substantial need for computational resources +continues to present a noteworthy challenge, contributing to time-consuming +processes. Quantization, a technique employed to compress deep learning models +for enhanced efficiency, presents challenges when applied to diffusion models. +These models are notably more sensitive to quantization compared to other model +types, potentially resulting in a degradation of image quality. In this paper, +we introduce a novel approach to quantize diffusion models by leveraging +both quantization-aware training and distillation. Our results show that the +quantized models maintain high image quality while delivering efficient +inference on CPUs. The code is publicly available at: +https://github.com/intel/intel-extension-for-transformers. +
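A minimal sketch of the combination described above (quantization-aware training plus distillation): a fake-quantized student UNet is trained to match the noise predictions of the full-precision teacher at matched timesteps. The function names, the plain MSE objective, and the calling convention are illustrative assumptions rather than the repository's actual API.

import torch
import torch.nn.functional as F

def qat_distill_step(student_q, teacher_fp, x_t, t, cond, optimizer):
    """One QAT + distillation step: the quantized student mimics the FP teacher."""
    with torch.no_grad():
        target = teacher_fp(x_t, t, cond)   # full-precision noise prediction
    pred = student_q(x_t, t, cond)          # student runs with fake-quantized weights/activations
    loss = F.mse_loss(pred, target)         # distillation objective (illustrative choice)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()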
+
+
+
+
+ + ♻ ☆ NeISF: Neural Incident Stokes Field for Geometry and Material Estimation + + +
+ Multi-view inverse rendering is the problem of estimating the scene +parameters such as shapes, materials, or illuminations from a sequence of +images captured under different viewpoints. Many approaches, however, assume +single light bounce and thus fail to recover challenging scenarios like +inter-reflections. On the other hand, simply extending those methods to +consider multi-bounced light requires more assumptions to alleviate the +ambiguity. To address this problem, we propose Neural Incident Stokes Fields +(NeISF), a multi-view inverse rendering framework that reduces ambiguities +using polarization cues. The primary motivation for using polarization cues is +that it is the accumulation of multi-bounced light, providing rich information +about geometry and material. Based on this knowledge, the proposed incident +Stokes field efficiently models the accumulated polarization effect with the +aid of an original physically-based differentiable polarimetric renderer. +Lastly, experimental results show that our method outperforms the existing +works in synthetic and real scenarios. + +
+
+
+
+
+ + ♻ ☆ SUR-adapter: Enhancing Text-to-Image Pre-trained Diffusion Models with + Large Language Models ACM MM 2023 + + +
+ Diffusion models, which have emerged to become popular text-to-image +generation models, can produce high-quality and content-rich images guided by +textual prompts. However, there are limitations to semantic understanding and +commonsense reasoning in existing models when the input prompts are concise +narrative, resulting in low-quality image generation. To improve the capacities +for narrative prompts, we propose a simple-yet-effective parameter-efficient +fine-tuning approach called the Semantic Understanding and Reasoning adapter +(SUR-adapter) for pre-trained diffusion models. To reach this goal, we first +collect and annotate a new dataset SURD which consists of more than 57,000 +semantically corrected multi-modal samples. Each sample contains a simple +narrative prompt, a complex keyword-based prompt, and a high-quality image. +Then, we align the semantic representation of narrative prompts to the complex +prompts and transfer knowledge of large language models (LLMs) to our +SUR-adapter via knowledge distillation so that it can acquire the powerful +semantic understanding and reasoning capabilities to build a high-quality +textual semantic representation for text-to-image generation. We conduct +experiments by integrating multiple LLMs and popular pre-trained diffusion +models to show the effectiveness of our approach in enabling diffusion models +to understand and reason concise natural language without image quality +degradation. Our approach can make text-to-image diffusion models easier to use +with better user experience, which demonstrates our approach has the potential +for further advancing the development of user-friendly text-to-image generation +models by bridging the semantic gap between simple narrative prompts and +complex keyword-based prompts. The code is released at +https://github.com/Qrange-group/SUR-adapter. + +
+
+ comment: accepted by ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ DeepEMplanner: An End-to-End EM Motion Planner with Iterative + Interactions + + +
+ Motion planning is a computational problem that finds a sequence of valid +trajectories, often based on surrounding agents' forecasting, environmental +understanding, and historical and future contexts. It can also be viewed as a +game in which agents continuously plan their next move according to other +agents' intentions and the encountering environment, further achieving their +ultimate goals through incremental actions. To model the dynamic planning and +interaction process, we propose a novel framework, DeepEMplanner, which takes +the stepwise interaction into account for fine-grained behavior learning. The +ego vehicle maximizes each step motion to reach its eventual driving outcome +based on the stepwise expectation from agents and its upcoming road conditions. +On the other hand, the agents also follow the same philosophy to maximize their +stepwise behavior under the encountering environment and the expectations from +ego and other agents. Our DeepEMplanner models the interactions among ego, +agents, and the dynamic environment in an autoregressive manner by interleaving +the Expectation and Maximization processes. Further, we design ego-to-agents, +ego-to-map, and ego-to-BEV interaction mechanisms with hierarchical dynamic key +objects attention to better model the interactions. Experiments on the nuScenes +benchmark show that our approach achieves state-of-the-art results. + +
+
+
+
+
+ + ♻ ☆ DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via + Multi-Modal Causal Attention + + +
+ Most of the existing multi-modal models, hindered by their incapacity to +adeptly manage interleaved image-and-text inputs in multi-image, multi-round +dialogues, face substantial constraints in resource allocation for training and +data accessibility, impacting their adaptability and scalability across varied +interaction realms. To address this, we present the DeepSpeed-VisualChat +framework, designed to optimize Large Language Models (LLMs) by incorporating +multi-modal capabilities, with a focus on enhancing the proficiency of Large +Vision and Language Models in handling interleaved inputs. Our framework is +notable for (1) its open-source support for multi-round and multi-image +dialogues, (2) introducing an innovative multi-modal causal attention +mechanism, and (3) utilizing data blending techniques on existing datasets to +assure seamless interactions in multi-round, multi-image conversations. +Compared to existing frameworks, DeepSpeed-VisualChat shows superior +scalability up to 70B parameter language model size, representing a significant +advancement in multi-modal language models and setting a solid foundation for +future explorations. + +
+
+
+
+
+ + ♻ ☆ Image Clustering Conditioned on Text Criteria + + +
+ Classical clustering methods do not provide users with direct control of the +clustering results, and the clustering results may not be consistent with the +relevant criterion that a user has in mind. In this work, we present a new +methodology for performing image clustering based on user-specified text +criteria by leveraging modern vision-language models and large language models. +We call our method Image Clustering Conditioned on Text Criteria (IC|TC), and +it represents a different paradigm of image clustering. IC|TC requires a +minimal and practical degree of human intervention and grants the user +significant control over the clustering results in return. Our experiments show +that IC|TC can effectively cluster images with various criteria, such as human +action, physical location, or the person's mood, while significantly +outperforming baselines. + +
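A hedged sketch of the pipeline implied above: a vision-language model describes each image with the user's criterion in mind, and a large language model names clusters and assigns images to them. The wrappers `vlm_describe` and `llm_complete` are assumed helper functions, not part of any specific library.

def cluster_by_text_criterion(images, criterion, vlm_describe, llm_complete, k):
    """Cluster images according to a user-specified text criterion (sketch)."""
    # 1) Describe each image with the criterion in mind.
    descriptions = [vlm_describe(img, f"Describe this image focusing on: {criterion}")
                    for img in images]
    # 2) Ask the LLM to propose k cluster names that partition the descriptions.
    names = llm_complete(
        f"Given these descriptions:\n{descriptions}\n"
        f"Propose {k} cluster names that group them by '{criterion}', one per line.")
    cluster_names = [n.strip() for n in names.splitlines() if n.strip()][:k]
    # 3) Assign every image to the best-matching cluster name.
    assignments = [llm_complete(
        f"Description: {d}\nWhich of these clusters fits best: {cluster_names}? "
        f"Answer with the cluster name only.") for d in descriptions]
    return cluster_names, assignments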
+
+
+
+
+ + ♻ ☆ Learning Stackable and Skippable LEGO Bricks for Efficient, + Reconfigurable, and Variable-Resolution Diffusion Modeling + + +
+ Diffusion models excel at generating photo-realistic images but come with +significant computational costs in both training and sampling. While various +techniques address these computational challenges, a less-explored issue is +designing an efficient and adaptable network backbone for iterative refinement. +Current options like U-Net and Vision Transformer often rely on +resource-intensive deep networks and lack the flexibility needed for generating +images at variable resolutions or with a smaller network than used in training. +This study introduces LEGO bricks, which seamlessly integrate Local-feature +Enrichment and Global-content Orchestration. These bricks can be stacked to +create a test-time reconfigurable diffusion backbone, allowing selective +skipping of bricks to reduce sampling costs and generate higher-resolution +images than the training data. LEGO bricks enrich local regions with an MLP and +transform them using a Transformer block while maintaining a consistent +full-resolution image across all bricks. Experimental results demonstrate that +LEGO bricks enhance training efficiency, expedite convergence, and facilitate +variable-resolution image generation while maintaining strong generative +performance. Moreover, LEGO significantly reduces sampling time compared to +other methods, establishing it as a valuable enhancement for diffusion models. + +
+
+
+
+
+ + ♻ ☆ Image Blending Algorithm with Automatic Mask Generation + + +
+ In recent years, image blending has gained popularity for its ability to +create visually stunning content. However, current image blending +algorithms mainly suffer from two problems: manually creating image blending +masks requires considerable manpower and material resources, and existing +algorithms cannot effectively solve the problems of brightness distortion and +low resolution. To this end, we propose a new image blending method with +automatic mask generation: it combines semantic object detection and +segmentation with mask generation to achieve deeply blended images, building on our +proposed saturation loss and a two-stage iteration of the PAN algorithm to +fix brightness distortion and low-resolution issues. Results on publicly +available datasets show that our method outperforms other classical image +blending algorithms on various performance metrics, including PSNR and SSIM. +
+
+ comment: 14 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Meta-Learning with a Geometry-Adaptive Preconditioner CVPR 2023 + + +
+ Model-agnostic meta-learning (MAML) is one of the most successful +meta-learning algorithms. It has a bi-level optimization structure where the +outer-loop process learns a shared initialization and the inner-loop process +optimizes task-specific weights. Although MAML relies on the standard gradient +descent in the inner-loop, recent studies have shown that controlling the +inner-loop's gradient descent with a meta-learned preconditioner can be +beneficial. Existing preconditioners, however, cannot simultaneously adapt in a +task-specific and path-dependent way. Additionally, they do not satisfy the +Riemannian metric condition, which can enable the steepest descent learning +with preconditioned gradient. In this study, we propose Geometry-Adaptive +Preconditioned gradient descent (GAP) that can overcome the limitations in +MAML; GAP can efficiently meta-learn a preconditioner that is dependent on +task-specific parameters, and its preconditioner can be shown to be a +Riemannian metric. Thanks to the two properties, the geometry-adaptive +preconditioner is effective for improving the inner-loop optimization. +Experiment results show that GAP outperforms the state-of-the-art MAML family +and preconditioned gradient descent-MAML (PGD-MAML) family in a variety of +few-shot learning tasks. Code is available at: +https://github.com/Suhyun777/CVPR23-GAP. + +
+
+ comment: Accepted at CVPR 2023. Code is available at: + https://github.com/Suhyun777/CVPR23-GAP; This is an extended version of our + previous CVPR23 work +
+
+
+
+
+ + ♻ ☆ Sketch-based Video Object Localization WACV 2024 + + +
+ We introduce Sketch-based Video Object Localization (SVOL), a new task aimed +at localizing spatio-temporal object boxes in video queried by the input +sketch. We first outline the challenges in the SVOL task and build the +Sketch-Video Attention Network (SVANet) with the following design principles: +(i) to consider temporal information of video and bridge the domain gap between +sketch and video; (ii) to accurately identify and localize multiple objects +simultaneously; (iii) to handle various styles of sketches; (iv) to be +classification-free. In particular, SVANet is equipped with a Cross-modal +Transformer that models the interaction between learnable object tokens, query +sketch, and video through attention operations, and learns upon a per-frame set +matching strategy that enables frame-wise prediction while utilizing global +video context. We evaluate SVANet on a newly curated SVOL dataset. By design, +SVANet successfully learns the mapping between the query sketches and video +objects, achieving state-of-the-art results on the SVOL benchmark. We further +confirm the effectiveness of SVANet via extensive ablation studies and +visualizations. Lastly, we demonstrate its transfer capability on unseen +datasets and novel categories, suggesting its high scalability in real-world +applications. + +
+
+ comment: WACV 2024; Code: https://github.com/sangminwoo/SVOL +
+
+
+
+
+ + ♻ ☆ Exo2EgoDVC: Dense Video Captioning of Egocentric Procedural Activities + Using Web Instructional Videos + + +
+ We propose a novel benchmark for cross-view knowledge transfer of dense video +captioning, adapting models from web instructional videos with exocentric views +to an egocentric view. While dense video captioning (predicting time segments +and their captions) is primarily studied with exocentric videos (e.g., +YouCook2), benchmarks with egocentric videos are restricted due to data +scarcity. To overcome the limited video availability, transferring knowledge +from abundant exocentric web videos is demanded as a practical approach. +However, learning the correspondence between exocentric and egocentric views is +difficult due to their dynamic view changes. The web videos contain mixed views +focusing on either human body actions or close-up hand-object interactions, +while the egocentric view is constantly shifting as the camera wearer moves. +This necessitates the in-depth study of cross-view transfer under complex view +changes. In this work, we first create a real-life egocentric dataset (EgoYC2) +whose captions are shared with YouCook2, enabling transfer learning between +these datasets assuming their ground-truth is accessible. To bridge the view +gaps, we propose a view-invariant learning method using adversarial training in +both the pre-training and fine-tuning stages. While the pre-training is +designed to learn invariant features against the mixed views in the web videos, +the view-invariant fine-tuning further mitigates the view gaps between both +datasets. We validate our proposed method by studying how effectively it +overcomes the view change problem and efficiently transfers the knowledge to +the egocentric domain. Our benchmark pushes the study of the cross-view +transfer into a new task domain of dense video captioning and will envision +methodologies to describe egocentric videos in natural language. + +
+
+
+
+
+ + ♻ ☆ Enhancing Adversarial Attacks: The Similar Target Method + + +
+ Deep neural networks are vulnerable to adversarial examples, posing a threat +to the models' applications and raising security concerns. An intriguing +property of adversarial examples is their strong transferability. Several +methods have been proposed to enhance transferability, including ensemble +attacks which have demonstrated their efficacy. However, prior approaches +simply average logits, probabilities, or losses for model ensembling, lacking a +comprehensive analysis of how and why model ensembling significantly improves +transferability. In this paper, we propose a similar targeted attack method +named Similar Target~(ST). By promoting cosine similarity between the gradients +of each model, our method regularizes the optimization direction to +simultaneously attack all surrogate models. This strategy has been proven to +enhance generalization ability. Experimental results on ImageNet validate the +effectiveness of our approach in improving adversarial transferability. Our +method outperforms state-of-the-art attackers on 18 discriminative classifiers +and adversarially trained models. + +
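A hedged sketch of the gradient-agreement idea described above: during one ensemble-attack step, the cosine similarity between the surrogate models' input gradients is added to the attack objective, so the update direction tends to fool all surrogates at once. The step size, the sign-gradient update, and the weighting `lam` are illustrative choices rather than the paper's exact settings.

import torch
import torch.nn.functional as F

def similar_target_step(x_adv, x_clean, y, models, alpha=2/255, eps=8/255, lam=1.0):
    """One ensemble attack step that also rewards agreement between the
    surrogate models' input gradients (cosine similarity)."""
    x_adv = x_adv.clone().detach().requires_grad_(True)
    losses = [F.cross_entropy(m(x_adv), y) for m in models]
    grads = [torch.autograd.grad(l, x_adv, retain_graph=True, create_graph=True)[0]
             for l in losses]
    flat = [g.flatten(1) for g in grads]
    sim = sum(F.cosine_similarity(flat[i], flat[j], dim=1).mean()
              for i in range(len(flat)) for j in range(i + 1, len(flat)))
    objective = sum(losses) + lam * sim                      # attack loss + gradient agreement
    step = torch.autograd.grad(objective, x_adv)[0].sign()
    x_next = x_adv.detach() + alpha * step
    x_next = x_clean + (x_next - x_clean).clamp(-eps, eps)   # project back into the eps-ball
    return x_next.clamp(0.0, 1.0)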
+
+
+
+
+ + ♻ ☆ UFOGen: You Forward Once Large Scale Text-to-Image Generation via + Diffusion GANs + + +
+ Text-to-image diffusion models have demonstrated remarkable capabilities in +transforming textual prompts into coherent images, yet the computational cost +of their inference remains a persistent challenge. To address this issue, we +present UFOGen, a novel generative model designed for ultra-fast, one-step +text-to-image synthesis. In contrast to conventional approaches that focus on +improving samplers or employing distillation techniques for diffusion models, +UFOGen adopts a hybrid methodology, integrating diffusion models with a GAN +objective. Leveraging a newly introduced diffusion-GAN objective and +initialization with pre-trained diffusion models, UFOGen excels in efficiently +generating high-quality images conditioned on textual descriptions in a single +step. Beyond traditional text-to-image generation, UFOGen showcases versatility +in applications. Notably, UFOGen stands among the pioneering models enabling +one-step text-to-image generation and diverse downstream tasks, presenting a +significant advancement in the landscape of efficient generative models. + +
+
+
+
+
+ + ♻ ☆ Learning Spatial Features from Audio-Visual Correspondence in Egocentric + Videos + + +
+ We propose a self-supervised method for learning representations based on +spatial audio-visual correspondences in egocentric videos. Our method uses a +masked auto-encoding framework to synthesize masked binaural (multi-channel) +audio through the synergy of audio and vision, thereby learning useful spatial +relationships between the two modalities. We use our pretrained features to +tackle two downstream video tasks requiring spatial understanding in social +scenarios: active speaker detection and spatial audio denoising. Through +extensive experiments, we show that our features are generic enough to improve +over multiple state-of-the-art baselines on both tasks on two challenging +egocentric video datasets that offer binaural audio, EgoCom and EasyCom. +Project: http://vision.cs.utexas.edu/projects/ego_av_corr. + +
+
+
+
+
+ + ♻ ☆ Hierarchical Relationships: A New Perspective to Enhance Scene Graph + Generation NeurIPS 2023 + + +
+ This paper presents a finding that leveraging the hierarchical structures +among labels for relationships and objects can substantially improve the +performance of scene graph generation systems. The focus of this work is to +create an informative hierarchical structure that can divide object and +relationship categories into disjoint super-categories in a systematic way. +Specifically, we introduce a Bayesian prediction head to jointly predict the +super-category of relationships between a pair of object instances, as well as +the detailed relationship within that super-category simultaneously, +facilitating more informative predictions. The resulting model exhibits the +capability to produce a more extensive set of predicates beyond the dataset +annotations, and to tackle the prevalent issue of low annotation quality. While +our paper presents preliminary findings, experiments on the Visual Genome +dataset show its strong performance, particularly in predicate classifications +and zero-shot settings, that demonstrates the promise of our approach. + +
+
+ comment: NeurIPS 2023 New Frontiers in Graph Learning Workshop (NeurIPS + GLFrontiers 2023); NeurIPS 2023 Queer in AI Workshop. This paper is a + preliminary work of the full paper available at arXiv:2311.12889 +
+
+
+
+
+ + ♻ ☆ Meaning Representations from Trajectories in Autoregressive Models + + +
+ We propose to extract meaning representations from autoregressive language +models by considering the distribution of all possible trajectories extending +an input text. This strategy is prompt-free, does not require fine-tuning, and +is applicable to any pre-trained autoregressive model. Moreover, unlike +vector-based representations, distribution-based representations can also model +asymmetric relations (e.g., direction of logical entailment, hypernym/hyponym +relations) by using algebraic operations between likelihood functions. These +ideas are grounded in distributional perspectives on semantics and are +connected to standard constructions in automata theory, but to our knowledge +they have not been applied to modern language models. We empirically show that +the representations obtained from large models align well with human +annotations, outperform other zero-shot and prompt-free methods on semantic +similarity tasks, and can be used to solve more complex entailment and +containment tasks that standard embeddings cannot handle. Finally, we extend +our method to represent data from different modalities (e.g., image and text) +using multimodal autoregressive models. Our code is available at: +https://github.com/tianyu139/meaning-as-trajectories + +
+
+
+
+
+ + ♻ ☆ AnyLoc: Towards Universal Visual Place Recognition ICRA 2024 + + +
+ Visual Place Recognition (VPR) is vital for robot localization. To date, the +most performant VPR approaches are environment- and task-specific: while they +exhibit strong performance in structured environments (predominantly urban +driving), their performance degrades severely in unstructured environments, +rendering most approaches brittle to robust real-world deployment. In this +work, we develop a universal solution to VPR -- a technique that works across a +broad range of structured and unstructured environments (urban, outdoors, +indoors, aerial, underwater, and subterranean environments) without any +re-training or fine-tuning. We demonstrate that general-purpose feature +representations derived from off-the-shelf self-supervised models with no +VPR-specific training are the right substrate upon which to build such a +universal VPR solution. Combining these derived features with unsupervised +feature aggregation enables our suite of methods, AnyLoc, to achieve up to 4X +significantly higher performance than existing approaches. We further obtain a +6% improvement in performance by characterizing the semantic properties of +these features, uncovering unique domains which encapsulate datasets from +similar environments. Our detailed experiments and analysis lay a foundation +for building VPR solutions that may be deployed anywhere, anytime, and across +anyview. We encourage the readers to explore our project page and interactive +demos: https://anyloc.github.io/. + +
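A rough sketch of the unsupervised aggregation step mentioned above: dense per-patch features from a frozen self-supervised backbone are pooled into a single global descriptor with VLAD against a k-means vocabulary. The backbone choice, vocabulary size, and normalization details are illustrative assumptions, not the paper's exact recipe.

import numpy as np
from sklearn.cluster import KMeans

def build_vocabulary(all_patch_feats, k=32, seed=0):
    """Unsupervised visual vocabulary from dense features of a database/map split."""
    return KMeans(n_clusters=k, random_state=seed, n_init=10).fit(all_patch_feats).cluster_centers_

def vlad(patch_feats, centers):
    """Aggregate (N, D) per-patch features into a single (K*D,) VLAD descriptor."""
    assign = np.argmin(((patch_feats[:, None, :] - centers[None]) ** 2).sum(-1), axis=1)
    desc = np.zeros_like(centers)
    for k in range(centers.shape[0]):
        if np.any(assign == k):
            desc[k] = (patch_feats[assign == k] - centers[k]).sum(0)
    desc /= np.linalg.norm(desc, axis=1, keepdims=True) + 1e-12   # intra-normalization
    desc = desc.reshape(-1)
    return desc / (np.linalg.norm(desc) + 1e-12)

# Place recognition then reduces to nearest-neighbor search over these descriptors.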
+
+ comment: IEEE RA-L 2023 (Presented at ICRA 2024) +
+
+
+
+
+ + ♻ ☆ Exploring the Relationship between Samples and Masks for Robust Defect + Localization + + +
+ Defect detection aims to detect and localize regions out of the normal +distribution. Previous approaches model normality and compare it with the input +to identify defective regions, potentially limiting their generalizability. This +paper proposes a one-stage framework that detects defective patterns directly +without the modeling process. This ability is achieved through the joint efforts +of three components: a generative adversarial network (GAN), a newly proposed +scaled pattern loss, and a dynamic masked cycle-consistent auxiliary network. +Explicit information that could indicate the position of defects is +intentionally excluded to avoid learning any direct mapping. Experimental +results on the texture class of the challenging MVTec AD dataset show that the +proposed method achieves an F1-Score 2.9% higher than SOTA methods, while +substantially outperforming them in generalizability. + +
+
+
+
+
+ + ♻ ☆ Zero-Shot Self-Supervised Learning for MRI Reconstruction + + +
+ Deep learning (DL) has emerged as a powerful tool for accelerated MRI +reconstruction, but often necessitates a database of fully-sampled measurements +for training. Recent self-supervised and unsupervised learning approaches +enable training without fully-sampled data. However, a database of undersampled +measurements may not be available in many scenarios, especially for scans +involving contrast or translational acquisitions in development. Moreover, +recent studies show that database-trained models may not generalize well when +the unseen measurements differ in terms of sampling pattern, acceleration rate, +SNR, image contrast, and anatomy. Such challenges necessitate a new methodology +to enable subject-specific DL MRI reconstruction without external training +datasets, since it is clinically imperative to provide high-quality +reconstructions that can be used to identify lesions/disease for \emph{every +individual}. In this work, we propose a zero-shot self-supervised learning +approach to perform subject-specific accelerated DL MRI reconstruction to +tackle these issues. The proposed approach partitions the available +measurements from a single scan into three disjoint sets. Two of these sets are +used to enforce data consistency and define loss during training for +self-supervision, while the last set serves to self-validate, establishing an +early stopping criterion. In the presence of models pre-trained on a database +with different image characteristics, we show that the proposed approach can be +combined with transfer learning for faster convergence time and reduced +computational complexity. The code is available at +\url{https://github.com/byaman14/ZS-SSL}. + +
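+ A minimal sketch of the measurement partitioning described above, assuming the acquired k-space sample indices
+ of a single scan are split into three disjoint sets (data consistency, training loss, and self-validation for
+ early stopping); the split ratios and toy undersampling mask are illustrative assumptions, not the paper's setup.
+
+```python
+import numpy as np
+
+def partition_measurements(acquired_idx, ratios=(0.6, 0.2, 0.2), seed=0):
+    """Split acquired measurement indices from one scan into three disjoint sets."""
+    assert abs(sum(ratios) - 1.0) < 1e-8
+    rng = np.random.default_rng(seed)
+    idx = rng.permutation(np.asarray(acquired_idx))
+    n_dc = int(ratios[0] * len(idx))
+    n_loss = int(ratios[1] * len(idx))
+    dc_set = idx[:n_dc]                    # enforces data consistency during training
+    loss_set = idx[n_dc:n_dc + n_loss]     # defines the self-supervised training loss
+    val_set = idx[n_dc + n_loss:]          # monitored to trigger early stopping
+    return dc_set, loss_set, val_set
+
+# Toy undersampling mask over 256 k-space locations (illustrative only).
+acquired = np.flatnonzero(np.random.default_rng(1).random(256) < 0.3)
+dc, loss, val = partition_measurements(acquired)
+print(len(dc), len(loss), len(val),
+      "disjoint partition:", len(set(dc) | set(loss) | set(val)) == len(acquired))
+```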
+
+
+
+
+ + ♻ ☆ A Comprehensive Augmentation Framework for Anomaly Detection + + +
+ Data augmentation methods are commonly integrated into the training of +anomaly detection models. Previous approaches have primarily focused on +replicating real-world anomalies or enhancing diversity, without considering +that the standard of anomaly varies across different classes, potentially +leading to a biased training distribution. This paper analyzes crucial traits of +simulated anomalies that contribute to the training of reconstructive networks +and condenses them into several methods, thus creating a comprehensive +framework by selectively utilizing appropriate combinations. Furthermore, we +integrate this framework with a reconstruction-based approach and concurrently +propose a split training strategy that alleviates the issue of overfitting +while avoiding introducing interference to the reconstruction process. The +evaluations conducted on the MVTec anomaly detection dataset demonstrate that +our method outperforms the previous state-of-the-art approach, particularly in +terms of object classes. To evaluate generalizability, we generate a simulated +dataset comprising anomalies with diverse characteristics since the original +test samples only include specific types of anomalies and may lead to biased +evaluations. Experimental results demonstrate that our approach exhibits +promising potential for generalizing effectively to various unforeseen +anomalies encountered in real-world scenarios. + +
+
+
+
+
+ + ♻ ☆ Multi-3D-Models Registration-Based Augmented Reality (AR) Instructions + for Assembly + + +
+ This paper introduces a novel, markerless, step-by-step, in-situ 3D Augmented +Reality (AR) instruction method and its application - BRICKxAR (Multi 3D +Models/M3D) - for small parts assembly. BRICKxAR (M3D) realistically visualizes +rendered 3D assembly parts at the assembly location of the physical assembly +model (Figure 1). The user controls the assembly process through a user +interface. BRICKxAR (M3D) utilizes deep learning-trained 3D model-based +registration. Object recognition and tracking become challenging as the +assembly model updates at each step. Additionally, not every part in a 3D +assembly may be visible to the camera during the assembly. BRICKxAR (M3D) +combines multiple assembly phases with a step count to address these +challenges. Thus, using fewer phases simplifies the complex assembly process +while step count facilitates accurate object recognition and precise +visualization of each step. A testing and heuristic evaluation of the BRICKxAR +(M3D) prototype and qualitative analysis were conducted with users and experts +in visualization and human-computer interaction. Providing robust 3D AR +instructions and allowing the handling of the assembly model, BRICKxAR (M3D) +has the potential to be used at different scales ranging from manufacturing +assembly to construction. + +
+
+
+
+
+
+
+
+ + Information Retrieval 9 + +
+
+
+ + ☆ $Q_{bias}$ -- A Dataset on Media Bias in Search Queries and Query + Suggestions + + +
+ This publication describes the motivation and generation of $Q_{bias}$, a +large dataset of Google and Bing search queries, a scraping tool and dataset +for biased news articles, as well as language models for the investigation of +bias in online search. Web search engines are a major factor and trusted source +in information search, especially in the political domain. However, biased +information can influence opinion formation and lead to biased opinions. To +interact with search engines, users formulate search queries and interact with +search query suggestions provided by the search engines. A lack of datasets on +search queries inhibits research on the subject. We use $Q_{bias}$ to evaluate +different approaches to fine-tuning transformer-based language models with the +goal of producing models capable of biasing text with left and right political +stance. In addition to this work, we provide datasets and language models for +biasing texts that allow further research on bias in online information search. + +
+
+ comment: Paper accepted at ACM Web Science Conference 2023. 6 pages +
+
+
+
+
+ + ☆ Creator Context for Tweet Recommendation + + +
+ When discussing a tweet, people usually not only refer to the content it +delivers, but also to the person behind the tweet. In other words, grounding +the interpretation of the tweet in the context of its creator plays an +important role in deciphering the true intent and the importance of the tweet. + In this paper, we attempt to answer the question of how creator context +should be used to advance tweet understanding. Specifically, we investigate the +usefulness of different types of creator context, and examine different model +structures for incorporating creator context in tweet modeling. We evaluate our +tweet understanding models on a practical use case -- recommending relevant +tweets to news articles. This use case already exists in popular news apps, and +can also serve as a useful assistive tool for journalists. We discover that +creator context is essential for tweet understanding, and can improve +application metrics by a large margin. However, we also observe that not all +creator contexts are equal. Creator context can be time sensitive and noisy. +Careful creator context selection and deliberate model structure design play an +important role in creator context effectiveness. + +
+
+
+
+
+ + ☆ Attribute Simulation for Item Embedding Enhancement in Multi-interest + Recommendation WSDM 2024 + + +
+ Although multi-interest recommenders have achieved significant progress in +the matching stage, our research reveals that existing models tend to exhibit +an under-clustered item embedding space, which leads to a low discernibility +between items and hampers item retrieval. This highlights the necessity for +item embedding enhancement. However, item attributes, which serve as effective +and straightforward side information for enhancement, are either unavailable or +incomplete in many public datasets due to the labor-intensive nature of manual +annotation tasks. This dilemma raises two meaningful questions: 1. Can we +bypass manual annotation and directly simulate complete attribute information +from the interaction data? And 2. If feasible, how to simulate attributes with +high accuracy and low complexity in the matching stage? + In this paper, we first establish an inspiring theoretical feasibility that +the item-attribute correlation matrix can be approximated through elementary +transformations on the item co-occurrence matrix. Then based on formula +derivation, we propose a simple yet effective module, SimEmb (Item Embedding +Enhancement via Simulated Attribute), in the multi-interest recommendation of +the matching stage to implement our findings. By simulating attributes with the +co-occurrence matrix, SimEmb discards the item ID-based embedding and employs +the attribute-weighted summation for item embedding enhancement. Comprehensive +experiments on four benchmark datasets demonstrate that our approach notably +enhances the clustering of item embedding and significantly outperforms SOTA +models with an average improvement of 25.59% on Recall@20. + +
+
+ comment: This paper has been accepted by the 17th ACM International Conference + on Web Search and Data Mining (WSDM 2024). The camera-ready version will be + available in the conference proceedings +
+
+
+
+
+ + ☆ AnonPSI: An Anonymity Assessment Framework for PSI + + +
+ Private Set Intersection (PSI) is a widely used protocol that enables two +parties to securely compute a function over the intersected part of their +shared datasets and has been a significant research focus over the years. +However, recent studies have highlighted its vulnerability to Set Membership +Inference Attacks (SMIA), where an adversary might deduce an individual's +membership by invoking multiple PSI protocols. This presents a considerable +risk, even in the most stringent versions of PSI, which only return the +cardinality of the intersection. This paper explores the evaluation of +anonymity within the PSI context. Initially, we highlight the reasons why +existing works fall short in measuring privacy leakage, and subsequently +propose two attack strategies that address these deficiencies. Furthermore, we +provide theoretical guarantees on the performance of our proposed methods. In +addition to these, we illustrate how the integration of auxiliary information, +such as the sum of payloads associated with members of the intersection +(PSI-SUM), can enhance attack efficiency. We conducted a comprehensive +performance evaluation of the proposed attack strategies utilizing two real +datasets. Our findings indicate that the methods we propose markedly enhance +attack efficiency when contrasted with previous research endeavors. The +effectiveness of these attacks implies that relying solely on existing PSI +protocols may not provide an adequate level of privacy assurance. It is +recommended to combine privacy-enhancing technologies synergistically to +enhance privacy protection even further. + +
+
+
+
+
+ + ♻ ☆ LM-Cocktail: Resilient Tuning of Language Models via Model Merging + + +
+ Pre-trained language models are continually fine-tuned to better support +downstream applications. However, this operation may result in significant +performance degeneration on general tasks beyond the targeted domain. To +overcome this problem, we propose LM-Cocktail which enables the fine-tuned +model to stay resilient in general perspectives. Our method is conducted in the +form of model merging, where the fine-tuned language model is merged with the +pre-trained base model or the peer models from other domains through weighted +average. Despite its simplicity, LM-Cocktail is surprisingly effective: the +resulting model is able to achieve a strong empirical performance in the whole +scope of general tasks while preserving a superior capacity in its targeted +domain. We conduct comprehensive experiments with the Llama and BGE models on +popular benchmarks, including FLAN, MMLU, and MTEB, whose results validate the +efficacy of our proposed method. The code and checkpoints are available at +https://github.com/FlagOpen/FlagEmbedding/tree/master/LM_Cocktail. + +
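+ A minimal sketch of merging by weighted parameter averaging as described above, assuming all checkpoints share
+ the same parameter layout; plain NumPy dictionaries stand in for real checkpoints, and the mixing weights are
+ illustrative rather than values from the paper.
+
+```python
+import numpy as np
+
+def merge_models(state_dicts, weights):
+    """Weighted average of same-shaped parameter dictionaries."""
+    weights = np.asarray(weights, dtype=float)
+    weights = weights / weights.sum()
+    return {name: sum(w * sd[name] for w, sd in zip(weights, state_dicts))
+            for name in state_dicts[0]}
+
+# Toy "checkpoints": the fine-tuned model, its base model, and a peer-domain model.
+rng = np.random.default_rng(0)
+layout = {"layer.weight": (4, 4), "layer.bias": (4,)}
+fine_tuned, base, peer = ({k: rng.normal(size=s) for k, s in layout.items()} for _ in range(3))
+cocktail = merge_models([fine_tuned, base, peer], weights=[0.5, 0.3, 0.2])
+print({k: v.shape for k, v in cocktail.items()})
+```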
+
+
+
+
+ + ♻ ☆ DiskANN++: Efficient Page-based Search over Isomorphic Mapped Graph + Index using Query-sensitivity Entry Vertex + + +
+ Given a vector dataset $\mathcal{X}$ and a query vector $\vec{x}_q$, +graph-based Approximate Nearest Neighbor Search (ANNS) aims to build a graph +index $G$ and approximately return vectors with minimum distances to +$\vec{x}_q$ by searching over $G$. The main drawback of graph-based ANNS is +that a graph index would be too large to fit into memory, especially for a +large-scale $\mathcal{X}$. To solve this, a Product Quantization (PQ)-based +hybrid method called DiskANN is proposed to store a low-dimensional PQ index in +memory and retain a graph index in SSD, thus reducing memory overhead while +ensuring a high search accuracy. However, it suffers from two I/O issues that +significantly affect the overall efficiency: (1) a long routing path from an +entry vertex to the query's neighborhood that results in a large number of I/O +requests and (2) redundant I/O requests during the routing process. We propose +an optimized DiskANN++ to overcome the above issues. Specifically, for the first +issue, we present a query-sensitive entry vertex selection strategy to replace +DiskANN's static graph-central entry vertex by a dynamically determined entry +vertex that is close to the query. For the second I/O issue, we present an +isomorphic mapping on DiskANN's graph index to optimize the SSD layout and +propose an asynchronously optimized Pagesearch based on the optimized SSD +layout as an alternative to DiskANN's beamsearch. Comprehensive experimental +studies on eight real-world datasets demonstrate DiskANN++'s superiority in +efficiency. We achieve a notable 1.5X to 2.2X improvement in QPS compared to +DiskANN, given the same accuracy constraint. + +
+
+ comment: 15 pages including references +
+
+
+
+
+ + ♻ ☆ FASER: Binary Code Similarity Search through the use of Intermediate + Representations + + +
+ Being able to identify functions of interest in cross-architecture software +is useful whether you are analysing for malware, securing the software supply +chain or conducting vulnerability research. Cross-Architecture Binary Code +Similarity Search has been explored in numerous studies and has used a wide +range of different data sources to achieve its goals. The data sources +typically used draw on common structures derived from binaries such as function +control flow graphs or binary level call graphs, the output of the disassembly +process or the outputs of a dynamic analysis approach. One data source which +has received less attention is binary intermediate representations. Binary +intermediate representations possess two interesting properties: they are +cross-architecture by their very nature and encode the semantics of a function +explicitly to support downstream usage. Within this paper we propose Function +as a String Encoded Representation (FASER) which combines long document +transformers with the use of intermediate representations to create a model +capable of cross-architecture function search without the need for manual +feature engineering, pre-training or a dynamic analysis step. We compare our +approach against a series of baseline approaches for two tasks: a general +function search task and a targeted vulnerability search task. Our approach +demonstrates strong performance across both tasks, performing better than all +baseline approaches. + +
+
+ comment: 10 pages, Proceedings of the Conference on Applied Machine Learning + in Information Security (CAMLIS) +
+
+
+
+
+ + ♻ ☆ LLMRec: Large Language Models with Graph Augmentation for Recommendation WSDM 2024 + + +
+ The problem of data sparsity has long been a challenge in recommendation +systems, and previous studies have attempted to address this issue by +incorporating side information. However, this approach often introduces side +effects such as noise, availability issues, and low data quality, which in turn +hinder the accurate modeling of user preferences and adversely impact +recommendation performance. In light of the recent advancements in large +language models (LLMs), which possess extensive knowledge bases and strong +reasoning capabilities, we propose a novel framework called LLMRec that +enhances recommender systems by employing three simple yet effective LLM-based +graph augmentation strategies. Our approach leverages the rich content +available within online platforms (e.g., Netflix, MovieLens) to augment the +interaction graph in three ways: (i) reinforcing user-item interaction edges, +(ii) enhancing the understanding of item node attributes, and (iii) conducting +user node profiling, intuitively from the natural language perspective. By +employing these strategies, we address the challenges posed by sparse implicit +feedback and low-quality side information in recommenders. Besides, to ensure +the quality of the augmentation, we develop a denoised data robustification +mechanism that includes techniques of noisy implicit feedback pruning and +MAE-based feature enhancement that help refine the augmented data and improve +its reliability. Furthermore, we provide theoretical analysis to support the +effectiveness of LLMRec and clarify the benefits of our method in facilitating +model optimization. Experimental results on benchmark datasets demonstrate the +superiority of our LLM-based augmentation approach over state-of-the-art +techniques. To ensure reproducibility, we have made our code and augmented data +publicly available at: https://github.com/HKUDS/LLMRec.git + +
+
+ comment: WSDM 2024 Oral Presentation +
+
+
+
+
+ + ♻ ☆ Exploring the Carbon Footprint of Hugging Face's ML Models: A Repository + Mining Study + + +
+ The rise of machine learning (ML) systems has exacerbated their carbon +footprint due to increased capabilities and model sizes. However, there is +scarce knowledge on how the carbon footprint of ML models is actually measured, +reported, and evaluated. In light of this, the paper aims to analyze the +measurement of the carbon footprint of 1,417 ML models and associated datasets +on Hugging Face, which is the most popular repository for pretrained ML models. +The goal is to provide insights and recommendations on how to report and +optimize the carbon efficiency of ML models. The study includes the first +repository mining study on the Hugging Face Hub API on carbon emissions. This +study seeks to answer two research questions: (1) how do ML model creators +measure and report carbon emissions on Hugging Face Hub?, and (2) what aspects +impact the carbon emissions of training ML models? The study yielded several +key findings. These include a stalled proportion of carbon emissions-reporting +models, a slight decrease in reported carbon footprint on Hugging Face over the +past 2 years, and a continued dominance of NLP as the main application domain. +Furthermore, the study uncovers correlations between carbon emissions and +various attributes such as model size, dataset size, and ML application +domains. These results highlight the need for software measurements to improve +energy reporting practices and promote carbon-efficient model development +within the Hugging Face community. In response to this issue, two +classifications are proposed: one for categorizing models based on their carbon +emission reporting practices and another for their carbon efficiency. The aim +of these classification proposals is to foster transparency and sustainable +model development within the ML community. + +
+
+ comment: Accepted at the 2023 ACM/IEEE International Symposium on Empirical + Software Engineering and Measurement (ESEM) +
+
+
+
+
+
+
+
+ + Machine Learning 124 + +
+
+
+ + ☆ SODA: Bottleneck Diffusion Models for Representation Learning + + +
+ We introduce SODA, a self-supervised diffusion model, designed for +representation learning. The model incorporates an image encoder, which +distills a source view into a compact representation, that, in turn, guides the +generation of related novel views. We show that by imposing a tight bottleneck +between the encoder and a denoising decoder, and leveraging novel view +synthesis as a self-supervised objective, we can turn diffusion models into +strong representation learners, capable of capturing visual semantics in an +unsupervised manner. To the best of our knowledge, SODA is the first diffusion +model to succeed at ImageNet linear-probe classification, and, at the same +time, it accomplishes reconstruction, editing and synthesis tasks across a wide +range of datasets. Further investigation reveals the disentangled nature of its +emergent latent space, that serves as an effective interface to control and +manipulate the model's produced images. All in all, we aim to shed light on the +exciting and promising potential of diffusion models, not only for image +generation, but also for learning rich and robust representations. + +
+
+
+
+
+ + ☆ Knowledge Pursuit Prompting for Zero-Shot Multimodal Synthesis + + +
+ Hallucinations and unfaithful synthesis due to inaccurate prompts with +insufficient semantic details are widely observed in multimodal generative +models. A prevalent strategy to align multiple modalities is to fine-tune the +generator with a large number of annotated text-image pairs. However, such a +procedure is labor-consuming and resource-draining. The key question we ask is: +can we enhance the quality and faithfulness of text-driven generative models +beyond extensive text-image pair annotations? To address this question, we +propose Knowledge Pursuit Prompting (KPP), a zero-shot framework that +iteratively incorporates external knowledge to help generators produce reliable +visual content. Instead of training generators to handle generic prompts, KPP +employs a recursive knowledge query process to gather informative external +facts from the knowledge base, instructs a language model to compress the +acquired knowledge for prompt refinement, and utilizes text-driven generators +for visual synthesis. The entire process is zero-shot, without accessing the +architectures and parameters of generative models. We evaluate the framework +across multiple text-driven generative tasks (image, 3D rendering, and video) +on datasets of different domains. We further demonstrate the extensibility and +adaptability of KPP through varying foundation model bases and instructions. +Our results show that KPP is capable of generating faithful and semantically +rich content across diverse visual domains, offering a promising solution to +improve multimodal generative models. + +
+
+
+
+
+ + ☆ Are ensembles getting better all the time? + + +
+ Ensemble methods combine the predictions of several base models. We study +whether or not including more models in an ensemble always improves its average +performance. Such a question depends on the kind of ensemble considered, as +well as the predictive metric chosen. We focus on situations where all members +of the ensemble are a priori expected to perform equally well, which is the case of +several popular methods like random forests or deep ensembles. In this setting, +we essentially show that ensembles are getting better all the time if, and only +if, the considered loss function is convex. More precisely, in that case, the +average loss of the ensemble is a decreasing function of the number of models. +When the loss function is nonconvex, we show a series of results that can be +summarised by the insight that ensembles of good models keep getting better, +and ensembles of bad models keep getting worse. To this end, we prove a new +result on the monotonicity of tail probabilities that may be of independent +interest. We illustrate our results on a simple machine learning problem +(diagnosing melanomas using neural nets). + +
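+ A small illustrative simulation of the convex-loss case, assuming i.i.d. base predictors as in a deep ensemble:
+ the estimated average squared-error loss of the averaged prediction should be non-increasing in the number of
+ members. This is only a numerical illustration, not the paper's proof or experiment.
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+target = 1.0
+n_trials, max_members = 20000, 10
+
+# Each base model's prediction is the target plus independent noise.
+preds = target + rng.normal(scale=0.8, size=(n_trials, max_members))
+
+for m in range(1, max_members + 1):
+    ensemble_pred = preds[:, :m].mean(axis=1)            # average the first m members
+    avg_loss = np.mean((ensemble_pred - target) ** 2)     # convex (squared-error) loss
+    print(f"members={m:2d}  average loss={avg_loss:.4f}")
+```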
+
+
+
+
+ + ☆ SAIBench: A Structural Interpretation of AI for Science Through + Benchmarks + + +
+ Artificial Intelligence for Science (AI4S) is an emerging research field that +utilizes machine learning advancements to tackle complex scientific +computational issues, aiming to enhance computational efficiency and accuracy. +However, the data-driven nature of AI4S lacks the correctness or accuracy +assurances of conventional scientific computing, posing challenges when +deploying AI4S models in real-world applications. To mitigate these, more +comprehensive benchmarking procedures are needed to better understand AI4S +models. This paper introduces a novel benchmarking approach, known as +structural interpretation, which addresses two key requirements: identifying +the trusted operating range in the problem space and tracing errors back to +their computational components. This method partitions both the problem and +metric spaces, facilitating a structural exploration of these spaces. The +practical utility and effectiveness of structural interpretation are +illustrated through its application to three distinct AI4S workloads: +machine-learning force fields (MLFF), jet tagging, and precipitation +nowcasting. The benchmarks effectively model the trusted operating range, trace +errors, and reveal novel perspectives for refining the model, training process, +and data sampling strategy. This work is part of the SAIBench project, an AI4S +benchmarking suite. + +
+
+
+
+
+ + ☆ Leveraging Graph Diffusion Models for Network Refinement Tasks + + +
+ Most real-world networks are noisy and incomplete samples from an unknown +target distribution. Refining them by correcting corruptions or inferring +unobserved regions typically improves downstream performance. Inspired by the +impressive generative capabilities that have been used to correct corruptions +in images, and the similarities between "in-painting" and filling in missing +nodes and edges conditioned on the observed graph, we propose a novel graph +generative framework, SGDM, which is based on subgraph diffusion. Our framework +not only improves the scalability and fidelity of graph diffusion models, but +also leverages the reverse process to perform novel, conditional generation +tasks. In particular, through extensive empirical analysis and a set of novel +metrics, we demonstrate that our proposed model effectively supports the +following refinement tasks for partially observable networks: T1: denoising +extraneous subgraphs, T2: expanding existing subgraphs and T3: performing +"style" transfer by regenerating a particular subgraph to match the +characteristics of a different node or subgraph. + +
+
+ comment: Work in Progress. 21 pages, 7 figures +
+
+
+
+
+ + ☆ Maximum Entropy Model Correction in Reinforcement Learning + + +
+ We propose and theoretically analyze an approach for planning with an +approximate model in reinforcement learning that can reduce the adverse impact +of model error. If the model is accurate enough, it also accelerates +convergence to the true value function. One of its key components is the MaxEnt Model +Correction (MoCo) procedure that corrects the model's next-state distributions +based on a Maximum Entropy density estimation formulation. Based on MoCo, we +introduce the Model Correcting Value Iteration (MoCoVI) algorithm, and its +sample-based variant MoCoDyna. We show that MoCoVI and MoCoDyna's convergence +can be much faster than the conventional model-free algorithms. Unlike +traditional model-based algorithms, MoCoVI and MoCoDyna effectively utilize an +approximate model and still converge to the correct value function. + +
+
+
+
+
+ + ☆ On the Adversarial Robustness of Graph Contrastive Learning Methods NeurIPS 2023 + + +
+ Contrastive learning (CL) has emerged as a powerful framework for learning +representations of images and text in a self-supervised manner while enhancing +model robustness against adversarial attacks. More recently, researchers have +extended the principles of contrastive learning to graph-structured data, +giving birth to the field of graph contrastive learning (GCL). However, whether +GCL methods can deliver the same advantages in adversarial robustness as their +counterparts in the image and text domains remains an open question. In this +paper, we introduce a comprehensive robustness evaluation protocol tailored to +assess the robustness of GCL models. We subject these models to adaptive +adversarial attacks targeting the graph structure, specifically in the evasion +scenario. We evaluate node and graph classification tasks using diverse +real-world datasets and attack strategies. With our work, we aim to offer +insights into the robustness of GCL methods and hope to open avenues for +potential future research directions. + +
+
+ comment: Accepted at NeurIPS 2023 New Frontiers in Graph Learning Workshop + (NeurIPS GLFrontiers 2023) +
+
+
+
+
+ + ☆ Look Before You Leap: Unveiling the Power of GPT-4V in Robotic + Vision-Language Planning + + +
+ In this study, we are interested in imbuing robots with the capability of +physically-grounded task planning. Recent advancements have shown that large +language models (LLMs) possess extensive knowledge useful in robotic tasks, +especially in reasoning and planning. However, LLMs are constrained by their +lack of world grounding and dependence on external affordance models to +perceive environmental information, which cannot jointly reason with LLMs. We +argue that a task planner should be an inherently grounded, unified multimodal +system. To this end, we introduce Robotic Vision-Language Planning (ViLa), a +novel approach for long-horizon robotic planning that leverages vision-language +models (VLMs) to generate a sequence of actionable steps. ViLa directly +integrates perceptual data into its reasoning and planning process, enabling a +profound understanding of commonsense knowledge in the visual world, including +spatial layouts and object attributes. It also supports flexible multimodal +goal specification and naturally incorporates visual feedback. Our extensive +evaluation, conducted in both real-robot and simulated environments, +demonstrates ViLa's superiority over existing LLM-based planners, highlighting +its effectiveness in a wide array of open-world manipulation tasks. + +
+
+
+
+
+ + ☆ A quasi-polynomial time algorithm for Multi-Dimensional Scaling via LP + hierarchies + + +
+ Multi-dimensional Scaling (MDS) is a family of methods for embedding +pair-wise dissimilarities between $n$ objects into low-dimensional space. MDS +is widely used as a data visualization tool in the social and biological +sciences, statistics, and machine learning. We study the Kamada-Kawai +formulation of MDS: given a set of non-negative dissimilarities $\{d_{i,j}\}_{i +, j \in [n]}$ over $n$ points, the goal is to find an embedding +$\{x_1,\dots,x_n\} \subset \mathbb{R}^k$ that minimizes \[ \text{OPT} = +\min_{x} \mathbb{E}_{i,j \in [n]} \left[ \left(1-\frac{\|x_i - +x_j\|}{d_{i,j}}\right)^2 \right] \] + Despite its popularity, our theoretical understanding of MDS is extremely +limited. Recently, Demaine, Hesterberg, Koehler, Lynch, and Urschel +(arXiv:2109.11505) gave the first approximation algorithm with provable +guarantees for Kamada-Kawai, which achieves an embedding with cost $\text{OPT} ++\epsilon$ in $n^2 \cdot 2^{\tilde{\mathcal{O}}(k \Delta^4 / \epsilon^2)}$ +time, where $\Delta$ is the aspect ratio of the input dissimilarities. In this +work, we give the first approximation algorithm for MDS with quasi-polynomial +dependency on $\Delta$: for target dimension $k$, we achieve a solution with +cost $\mathcal{O}(\text{OPT}^{ \hspace{0.04in}1/k } \cdot \log(\Delta/\epsilon) +)+ \epsilon$ in time $n^{ \mathcal{O}(1)} \cdot 2^{\tilde{\mathcal{O}}( k^2 +(\log(\Delta)/\epsilon)^{k/2 + 1} ) }$. + Our approach is based on a novel analysis of a conditioning-based rounding +scheme for the Sherali-Adams LP Hierarchy. Crucially, our analysis exploits the +geometry of low-dimensional Euclidean space, allowing us to avoid an +exponential dependence on the aspect ratio $\Delta$. We believe our +geometry-aware treatment of the Sherali-Adams Hierarchy is an important step +towards developing general-purpose techniques for efficient metric optimization +algorithms. + +
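+ A minimal sketch of evaluating the Kamada-Kawai objective stated above for a candidate embedding; it is purely
+ illustrative and unrelated to the paper's Sherali-Adams-based algorithm.
+
+```python
+import numpy as np
+
+def kamada_kawai_cost(X, D):
+    """Mean of (1 - ||x_i - x_j|| / d_ij)^2 over pairs i != j.
+
+    X: (n, k) candidate embedding; D: (n, n) positive dissimilarities off the diagonal."""
+    n = X.shape[0]
+    dist = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=-1)
+    mask = ~np.eye(n, dtype=bool)
+    return np.mean((1.0 - dist[mask] / D[mask]) ** 2)
+
+rng = np.random.default_rng(0)
+n, k = 6, 2
+P = rng.normal(size=(n, 3))                                        # latent points
+D = np.linalg.norm(P[:, None] - P[None, :], axis=-1) + np.eye(n)   # keep the diagonal nonzero
+X = rng.normal(size=(n, k))                                        # candidate embedding
+print(kamada_kawai_cost(X, D))
+```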
+
+
+
+
+ + ☆ Analyzing and Explaining Image Classifiers via Diffusion Guidance + + +
+ While deep learning has led to huge progress in complex image classification +tasks like ImageNet, unexpected failure modes, e.g. via spurious features, call +into question how reliably these classifiers work in the wild. Furthermore, for +safety-critical tasks the black-box nature of their decisions is problematic, +and explanations or at least methods which make decisions plausible are needed +urgently. In this paper, we address these problems by generating images that +optimize a classifier-derived objective using a framework for guided image +generation. We analyze the behavior and decisions of image classifiers by +visual counterfactual explanations (VCEs), detection of systematic mistakes by +analyzing images where classifiers maximally disagree, and visualization of +neurons to verify potential spurious features. In this way, we validate +existing observations, e.g. the shape bias of adversarially robust models, as +well as novel failure modes, e.g. systematic errors of zero-shot CLIP +classifiers, or identify harmful spurious features. Moreover, our VCEs +outperform previous work while being more versatile. + +
+
+
+
+
+ + ☆ Towards Efficient Hyperdimensional Computing Using Photonics + + +
+ Over the past few years, silicon photonics-based computing has emerged as a +promising alternative to CMOS-based computing for Deep Neural Networks (DNN). +Unfortunately, the non-linear operations and the high-precision requirements of +DNNs make it extremely challenging to design efficient silicon photonics-based +systems for DNN inference and training. Hyperdimensional Computing (HDC) is an +emerging, brain-inspired machine learning technique that enjoys several +advantages over existing DNNs, including being lightweight, requiring +low-precision operands, and being robust to noise introduced by the +nonidealities in the hardware. For HDC, computing in-memory (CiM) approaches +have been widely used, as CiM reduces the data transfer cost if the operands +can fit into the memory. However, inefficient multi-bit operations, high write +latency, and low endurance make CiM ill-suited for HDC. On the other hand, the +existing electro-photonic DNN accelerators are inefficient for HDC because they +are specifically optimized for matrix multiplication in DNNs and consume a lot +of power with high-precision data converters. + In this paper, we argue that photonic computing and HDC complement each other +better than photonic computing and DNNs, or CiM and HDC. We propose PhotoHDC, +the first-ever electro-photonic accelerator for HDC training and inference, +supporting the basic, record-based, and graph encoding schemes. Evaluating with +popular datasets, we show that our accelerator can achieve two to five orders +of magnitude lower EDP than the state-of-the-art electro-photonic DNN +accelerators for implementing HDC training and inference. PhotoHDC also +achieves four orders of magnitude lower energy-delay product than CiM-based +accelerators for both HDC training and inference. + +
+
+
+
+
+ + ☆ Learning to Simulate: Generative Metamodeling via Quantile Regression + + +
+ Stochastic simulation models, while effective in capturing the dynamics of +complex systems, are often too slow to run for real-time decision-making. +Metamodeling techniques are widely used to learn the relationship between a +summary statistic of the outputs (e.g., the mean or quantile) and the inputs of +the simulator, so that it can be used in real time. However, this methodology +requires the knowledge of an appropriate summary statistic in advance, making +it inflexible for many practical situations. In this paper, we propose a new +metamodeling concept, called generative metamodeling, which aims to construct a +"fast simulator of the simulator". This technique can generate random outputs +substantially faster than the original simulation model, while retaining an +approximately equal conditional distribution given the same inputs. Once +constructed, a generative metamodel can instantaneously generate a large amount +of random outputs as soon as the inputs are specified, thereby facilitating the +immediate computation of any summary statistic for real-time decision-making. +Furthermore, we propose a new algorithm -- quantile-regression-based generative +metamodeling (QRGMM) -- and study its convergence and rate of convergence. +Extensive numerical experiments are conducted to investigate the empirical +performance of QRGMM, compare it with other state-of-the-art generative +algorithms, and demonstrate its usefulness in practical real-time +decision-making. + +
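+ As a rough sketch of the idea (one plausible instantiation, heavily simplified and not the paper's QRGMM
+ algorithm): estimate conditional quantiles of the simulator output on a grid of levels, then generate new
+ outputs by inverse-transform sampling with a uniformly drawn level. The toy simulator, design grid, and
+ empirical quantile estimation below are all illustrative assumptions.
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def slow_simulator(x, rng):
+    """Stand-in stochastic simulation model whose output distribution depends on x."""
+    return x + rng.exponential(scale=1.0 + x)
+
+# Offline stage: run the simulator over a design of inputs and tabulate conditional quantiles.
+design = np.linspace(0.0, 2.0, 21)
+levels = np.linspace(0.01, 0.99, 99)
+reps = 2000
+quantile_table = np.array([
+    np.quantile([slow_simulator(x, rng) for _ in range(reps)], levels)
+    for x in design
+])                                              # shape: (len(design), len(levels))
+
+# Online stage: generate random outputs for a new input by drawing a uniform level
+# and reading off (interpolating) the estimated conditional quantile at that level.
+def generate(x, size, rng):
+    u = rng.uniform(levels[0], levels[-1], size=size)
+    q_at_x = np.array([np.interp(x, design, quantile_table[:, j]) for j in range(len(levels))])
+    return np.interp(u, levels, q_at_x)
+
+print(generate(1.3, size=5, rng=rng))
+```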
+
+ comment: Main body: 36 pages, 7 figures; supplemental material: 12 pages +
+
+
+
+
+ + ☆ Marginal Laplacian Score + + +
+ High-dimensional imbalanced data poses a machine learning challenge. In the +absence of sufficient or high-quality labels, unsupervised feature selection +methods are crucial for the success of subsequent algorithms. Therefore, there +is a growing need for unsupervised feature selection algorithms focused on +imbalanced data. Thus, we propose the Marginal Laplacian Score (MLS), a +modification of the well-known Laplacian Score (LS) that is better suited to +imbalanced data. We introduce the assumption that the minority class or anomalous +samples appear more frequently in the margins of the features. Consequently, MLS aims to +preserve the local structure of the data set's margin. As MLS is better suited +for handling imbalanced data, we propose its integration into modern feature +selection methods that utilize the Laplacian score. We integrate the MLS +algorithm into the Differentiable Unsupervised Feature Selection (DUFS), +resulting in DUFS-MLS. The proposed methods demonstrate robust and improved +performance on synthetic and public data sets. + +
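+ For reference, a minimal NumPy sketch of the classical Laplacian Score that MLS modifies (lower scores indicate
+ features that better preserve local structure); the margin-focused weighting that distinguishes MLS is not
+ reproduced here, and a dense heat-kernel affinity replaces the usual k-nearest-neighbor graph for brevity.
+
+```python
+import numpy as np
+
+def laplacian_score(X, sigma=1.0):
+    """Classical Laplacian Score for each feature column of X (lower = better)."""
+    sq_dists = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
+    W = np.exp(-sq_dists / sigma)                 # dense heat-kernel affinity (simplification)
+    np.fill_diagonal(W, 0.0)
+    d = W.sum(axis=1)
+    D, L = np.diag(d), np.diag(d) - W
+    scores = []
+    for r in range(X.shape[1]):
+        f = X[:, r]
+        f_tilde = f - (f @ d) / d.sum()           # remove the degree-weighted mean
+        scores.append((f_tilde @ L @ f_tilde) / (f_tilde @ D @ f_tilde))
+    return np.array(scores)
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(100, 5))
+X[:, 0] = np.repeat([0.0, 3.0], 50) + 0.1 * rng.normal(size=100)  # one structured feature
+print(laplacian_score(X))
+```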
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Propagate & Distill: Towards Effective Graph Learners Using + Propagation-Embracing MLPs + + +
+ Recent studies attempted to utilize multilayer perceptrons (MLPs) to solve +semisupervised node classification on graphs, by training a student MLP by +knowledge distillation from a teacher graph neural network (GNN). While +previous studies have focused mostly on training the student MLP by matching +the output probability distributions between the teacher and student models +during distillation, it has not been systematically studied how to inject the +structural information in an explicit and interpretable manner. Inspired by +GNNs that separate feature transformation $T$ and propagation $\Pi$, we +re-frame the distillation process as making the student MLP learn both $T$ and +$\Pi$. Although this can be achieved by applying the inverse propagation +$\Pi^{-1}$ before distillation from the teacher, it still comes with a high +computational cost from large matrix multiplications during training. To solve +this problem, we propose Propagate & Distill (P&D), which propagates the output +of the teacher before distillation, which can be interpreted as an approximate +process of the inverse propagation. We demonstrate that P&D can readily improve +the performance of the student MLP. + +
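+ A rough sketch of propagating teacher outputs over the graph before distillation, under the assumption that
+ propagation applies a symmetrically normalized adjacency a few times to the teacher's soft predictions, which
+ then serve as targets for the feature-only student MLP; the operator, number of steps, and toy graph are
+ assumptions for illustration, not the paper's exact procedure.
+
+```python
+import numpy as np
+
+def normalize_adj(A):
+    """Symmetrically normalized adjacency with self-loops: D^(-1/2) (A + I) D^(-1/2)."""
+    A_hat = A + np.eye(A.shape[0])
+    d_inv_sqrt = 1.0 / np.sqrt(A_hat.sum(axis=1))
+    return A_hat * d_inv_sqrt[:, None] * d_inv_sqrt[None, :]
+
+def propagate_teacher_outputs(A, teacher_probs, steps=2):
+    """Propagate teacher class probabilities over the graph; the result becomes
+    the per-node soft target for distilling into a structure-free student MLP."""
+    P = normalize_adj(A)
+    targets = teacher_probs
+    for _ in range(steps):
+        targets = P @ targets
+    return targets
+
+A = np.array([[0, 1, 0, 0],
+              [1, 0, 1, 0],
+              [0, 1, 0, 1],
+              [0, 0, 1, 0]], dtype=float)
+teacher_probs = np.array([[0.9, 0.1], [0.8, 0.2], [0.3, 0.7], [0.2, 0.8]])
+print(propagate_teacher_outputs(A, teacher_probs))
+```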
+
+ comment: 17 pages, 2 figures, 8 tables; 2nd Learning on Graphs Conference (LoG + 2023) (Please cite our conference version.). arXiv admin note: substantial + text overlap with arXiv:2311.11759 +
+
+
+
+
+ + ☆ Unified Binary and Multiclass Margin-Based Classification + + +
+ The notion of margin loss has been central to the development and analysis of +algorithms for binary classification. To date, however, there remains no +consensus as to the analogue of the margin loss for multiclass classification. +In this work, we show that a broad range of multiclass loss functions, +including many popular ones, can be expressed in the relative margin form, a +generalization of the margin form of binary losses. The relative margin form is +broadly useful for understanding and analyzing multiclass losses as shown by +our prior work (Wang and Scott, 2020, 2021). To further demonstrate the utility +of this way of expressing multiclass losses, we use it to extend the seminal +result of Bartlett et al. (2006) on classification-calibration of binary margin +losses to multiclass. We then analyze the class of Fenchel-Young losses, and +expand the set of these losses that are known to be classification-calibrated. + +
+
+
+
+
+ + ☆ Addressing Membership Inference Attack in Federated Learning with Model + Compression + + +
+ Federated Learning (FL) has been proposed as a privacy-preserving solution +for machine learning. However, recent works have shown that Federated Learning +can leak private client data through membership attacks. In this paper, we show +that the effectiveness of these attacks on the clients negatively correlates +with the size of the client datasets and model complexity. Based on this +finding, we propose model-agnostic Federated Learning as a privacy-enhancing +solution because it enables the use of models of varying complexity in the +clients. To this end, we present $\texttt{MaPP-FL}$, a novel privacy-aware FL +approach that leverages model compression on the clients while keeping a full +model on the server. We compare the performance of $\texttt{MaPP-FL}$ against +state-of-the-art model-agnostic FL methods on the CIFAR-10, CIFAR-100, and +FEMNIST vision datasets. Our experiments show the effectiveness of +$\texttt{MaPP-FL}$ in preserving the clients' and the server's privacy while +achieving competitive classification accuracies. + +
+
+
+
+
+ + ☆ A transductive few-shot learning approach for classification of digital + histopathological slides from liver cancer + + +
+ This paper presents a new approach for classifying 2D histopathology patches +using few-shot learning. The method is designed to tackle a significant +challenge in histopathology, which is the limited availability of labeled data. +By applying a sliding window technique to histopathology slides, we illustrate +the practical benefits of transductive learning (i.e., making joint predictions +on patches) to achieve consistent and accurate classification. Our approach +involves an optimization-based strategy that actively penalizes the prediction +of a large number of distinct classes within each window. We conducted +experiments on histopathological data to classify tissue classes in digital +slides of liver cancer, specifically hepatocellular carcinoma. The initial +results show the effectiveness of our method and its potential to enhance the +process of automated cancer diagnosis and treatment, all while reducing the +time and effort required for expert annotation. + +
+
+
+
+
+ + ☆ SenTest: Evaluating Robustness of Sentence Encoders + + +
+ Contrastive learning has proven to be an effective method for pre-training +models using weakly labeled data in the vision domain. Sentence transformers +are the NLP counterparts to this architecture, and have been growing in +popularity due to their rich and effective sentence representations. Having +effective sentence representations is paramount in multiple tasks, such as +information retrieval, retrieval augmented generation (RAG), and sentence +comparison. Keeping in mind the deployability factor of transformers, +evaluating the robustness of sentence transformers is of utmost importance. +This work focuses on evaluating the robustness of the sentence encoders. We +employ several adversarial attacks to evaluate their robustness: +character-level attacks in the form of random character substitution, +word-level attacks in the form of synonym replacement, and sentence-level +attacks in the form of intra-sentence word order shuffling. The results of the +experiments reveal serious weaknesses in the robustness of sentence encoders. The models +produce significantly different predictions as well as embeddings on perturbed +datasets. The accuracy of the models can fall by up to 15 percent on perturbed +datasets as compared to unperturbed datasets. Furthermore, the experiments +demonstrate that these embeddings do capture the semantic and syntactic +structure (sentence order) of sentences. However, existing supervised +classification strategies fail to leverage this information, and merely +function as n-gram detectors. + +
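+ A minimal sketch of the three perturbation families described above (character-level substitution, word-level
+ synonym replacement, and intra-sentence word-order shuffling); the tiny synonym table and perturbation rate are
+ illustrative stand-ins for the paper's actual attack setup.
+
+```python
+import random
+import string
+
+SYNONYMS = {"quick": "fast", "happy": "glad", "movie": "film"}  # toy lexicon
+
+def char_substitute(text, rate=0.1, rng=random):
+    """Randomly replace alphabetic characters with random lowercase letters."""
+    chars = list(text)
+    for i, c in enumerate(chars):
+        if c.isalpha() and rng.random() < rate:
+            chars[i] = rng.choice(string.ascii_lowercase)
+    return "".join(chars)
+
+def synonym_replace(text):
+    """Swap words for synonyms where the lexicon has an entry."""
+    return " ".join(SYNONYMS.get(w, w) for w in text.split())
+
+def shuffle_words(text, rng=random):
+    """Shuffle word order within the sentence."""
+    words = text.split()
+    rng.shuffle(words)
+    return " ".join(words)
+
+rng = random.Random(0)
+sentence = "the quick actor made a happy movie"
+print(char_substitute(sentence, rng=rng))
+print(synonym_replace(sentence))
+print(shuffle_words(sentence, rng=rng))
+```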
+
+
+
+
+ + ☆ Receler: Reliable Concept Erasing of Text-to-Image Diffusion Models via + Lightweight Erasers + + +
+ Concept erasure in text-to-image diffusion models aims to disable pre-trained +diffusion models from generating images related to a target concept. To perform +reliable concept erasure, the properties of robustness and locality are +desirable. The former refrains the model from producing images associated with +the target concept for any paraphrased or learned prompts, while the latter +preserves the model ability in generating images for non-target concepts. In +this paper, we propose Reliable Concept Erasing via Lightweight Erasers +(Receler), which learns a lightweight Eraser to perform concept erasing and +enhances locality and robustness with the proposed concept-localized +regularization and adversarial prompt learning, respectively. Comprehensive +quantitative and qualitative experiments with various concept prompts verify +the superiority of Receler over the previous erasing methods on the above two +desirable properties. + +
+
+
+
+
+ + ☆ Fair Text-to-Image Diffusion via Fair Mapping + + +
+ In this paper, we address the limitations of existing text-to-image diffusion +models in generating demographically fair results when given human-related +descriptions. These models often struggle to disentangle the target language +context from sociocultural biases, resulting in biased image generation. To +overcome this challenge, we propose Fair Mapping, a general, model-agnostic, +and lightweight approach that modifies a pre-trained text-to-image model by +controlling the prompt to achieve fair image generation. One key advantage of +our approach is its high efficiency. The training process only requires +updating a small number of parameters in an additional linear mapping network. +This not only reduces the computational cost but also accelerates the +optimization process. We first demonstrate the issue of bias in generated +results caused by language biases in text-guided diffusion models. By +developing a mapping network that projects language embeddings into an unbiased +space, we enable the generation of relatively balanced demographic results +based on a keyword specified in the prompt. With comprehensive experiments on +face image generation, we show that our method significantly improves image +generation performance when prompted with descriptions related to human faces. +By effectively addressing the issue of bias, we produce more fair and diverse +image outputs. This work contributes to the field of text-to-image generation +by enhancing the ability to generate images that accurately reflect the +intended demographic characteristics specified in the text. + +
+
+
+
+
+ + ☆ Toward a Surgeon-in-the-Loop Ophthalmic Robotic Apprentice using + Reinforcement and Imitation Learning + + +
+ Robotic-assisted surgical systems have demonstrated significant potential in +enhancing surgical precision and minimizing human errors. However, existing +systems lack the ability to accommodate the unique preferences and requirements +of individual surgeons. Additionally, they primarily focus on general surgeries +(e.g., laparoscopy) and are not suitable for highly precise microsurgeries, +such as ophthalmic procedures. Thus, we propose a simulation-based image-guided +approach for surgeon-centered autonomous agents that can adapt to the +individual surgeon's skill level and preferred surgical techniques during +ophthalmic cataract surgery. Our approach utilizes a simulated environment to +train reinforcement and imitation learning agents guided by image data to +perform all tasks of the incision phase of cataract surgery. By integrating the +surgeon's actions and preferences into the training process with the +surgeon-in-the-loop, our approach enables the robot to implicitly learn and +adapt to the individual surgeon's unique approach through demonstrations. This +results in a more intuitive and personalized surgical experience for the +surgeon. Simultaneously, it ensures consistent performance for the autonomous +robotic apprentice. We define and evaluate the effectiveness of our approach +using our proposed metrics; and highlight the trade-off between a generic agent +and a surgeon-centered adapted agent. Moreover, our approach has the potential +to extend to other ophthalmic surgical procedures, opening the door to a new +generation of surgeon-in-the-loop autonomous surgical robots. We provide an +open-source simulation framework for future development and reproducibility. + +
+
+
+
+
+ + ☆ Using Ornstein-Uhlenbeck Process to understand Denoising Diffusion + Probabilistic Model and its Noise Schedules + + +
+ The aim of this short note is to show that the Denoising Diffusion Probabilistic +Model (DDPM), a non-homogeneous discrete-time Markov process, can be represented +by a time-homogeneous continuous-time Markov process observed at non-uniformly +sampled discrete times. Surprisingly, this continuous-time Markov process is +the well-known and well-studied Ornstein-Uhlenbeck (OU) process, which was +developed in the 1930s for studying Brownian particles in harmonic potentials. We +establish the formal equivalence between DDPM and the OU process using its +analytical solution. We further demonstrate that the design problem of the +noise scheduler for non-homogeneous DDPM is equivalent to designing observation +times for the OU process. We present several heuristic designs for observation +times based on principled quantities such as auto-variance and Fisher +Information and connect them to ad hoc noise schedules for DDPM. Interestingly, +we show that the Fisher-Information-motivated schedule corresponds exactly to the +cosine schedule, which was developed without any theoretical foundation but is +the current state-of-the-art noise schedule. + +
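+ For concreteness, a minimal sketch of the cosine noise schedule referenced above, in one common formulation
+ where the cumulative signal level is a squared cosine with a small offset, normalized by its value at t = 0;
+ this is a generic reference implementation of the schedule, not code or notation from the note itself.
+
+```python
+import numpy as np
+
+def cosine_alpha_bar(T, s=0.008):
+    """Cumulative signal level alpha_bar(t) = cos^2(((t/T + s)/(1 + s)) * pi/2), normalized at t = 0."""
+    t = np.arange(T + 1)
+    f = np.cos(((t / T) + s) / (1 + s) * np.pi / 2) ** 2
+    return f / f[0]
+
+def betas_from_alpha_bar(alpha_bar, max_beta=0.999):
+    """Per-step noise rates implied by the cumulative schedule."""
+    betas = 1.0 - alpha_bar[1:] / alpha_bar[:-1]
+    return np.clip(betas, 0.0, max_beta)
+
+alpha_bar = cosine_alpha_bar(T=1000)
+betas = betas_from_alpha_bar(alpha_bar)
+print(alpha_bar[[0, 250, 500, 750, 1000]])
+print(betas[:3], betas[-3:])
+```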
+
+
+
+
+ + ☆ A novel feature selection method based on quantum support vector machine + + +
+ Feature selection is critical in machine learning to reduce dimensionality +and improve model accuracy and efficiency. The exponential growth in feature +space dimensionality for modern datasets directly results in ambiguous samples +and redundant features, which can severely degrade classification accuracy. +Quantum machine learning offers potential advantages for addressing this +challenge. In this paper, we propose a novel method, quantum support vector +machine feature selection (QSVMF), integrating quantum support vector machines +with a multi-objective genetic algorithm. QSVMF optimizes multiple simultaneous +objectives: maximizing classification accuracy, minimizing selected features +and quantum circuit costs, and reducing feature covariance. We apply QSVMF for +feature selection on a breast cancer dataset, comparing the performance of +QSVMF against classical approaches with the selected features. Experimental +results show that QSVMF achieves superior performance. Furthermore, the Pareto +front solutions of QSVMF enable analysis of accuracy versus feature set size +trade-offs, identifying extremely sparse yet accurate feature subsets. We +contextualize the biological relevance of the selected features in terms of +known breast cancer biomarkers. This work highlights the potential of +quantum-based feature selection to enhance machine learning efficiency and +performance on complex real-world data. + +
+
+
+
+
+ + ☆ Introduction to Transformers: an NLP Perspective + + +
+ Transformers have dominated empirical machine learning models of natural +language processing. In this paper, we introduce basic concepts of Transformers +and present key techniques that form the recent advances of these models. This +includes a description of the standard Transformer architecture, a series of +model refinements, and common applications. Given that Transformers and related +deep learning techniques might be evolving in ways we have never seen, we +cannot dive into all the model details or cover all the technical areas. +Instead, we focus on just those concepts that are helpful for gaining a good +understanding of Transformers and their variants. We also summarize the key +ideas that impact this field, thereby yielding some insights into the strengths +and limitations of these models. + +
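+ Since the abstract refers to the standard Transformer architecture, a minimal NumPy sketch of its central
+ operation, scaled dot-product attention, is included as a concrete reference; it is single-head and omits
+ masking, projections, and multi-head structure, and is a generic illustration rather than code from the paper.
+
+```python
+import numpy as np
+
+def scaled_dot_product_attention(Q, K, V):
+    """softmax(Q K^T / sqrt(d_k)) V for a single attention head."""
+    d_k = Q.shape[-1]
+    scores = Q @ K.T / np.sqrt(d_k)
+    scores = scores - scores.max(axis=-1, keepdims=True)   # numerical stability
+    weights = np.exp(scores)
+    weights = weights / weights.sum(axis=-1, keepdims=True)
+    return weights @ V
+
+rng = np.random.default_rng(0)
+seq_len, d_model = 4, 8
+Q, K, V = (rng.normal(size=(seq_len, d_model)) for _ in range(3))
+print(scaled_dot_product_attention(Q, K, V).shape)  # (4, 8)
+```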
+
+ comment: 119 pages and 21 figures +
+
+
+
+
+ + ☆ Q-learning Based Optimal False Data Injection Attack on Probabilistic + Boolean Control Networks + + +
+ In this paper, we present a reinforcement learning (RL) method for solving +optimal false data injection attack problems in probabilistic Boolean control +networks (PBCNs) where the attacker lacks knowledge of the system model. +Specifically, we employ a Q-learning (QL) algorithm to address this problem. We +then propose an improved QL algorithm that not only enhances learning +efficiency but also obtains optimal attack strategies for large-scale PBCNs +that the standard QL algorithm cannot handle. Finally, we verify the +effectiveness of our proposed approach by considering two attacked PBCNs, +including a 10-node network and a 28-node network. + +
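+ For reference, a minimal sketch of the tabular Q-learning update underlying the approach, on a generic finite
+ state-action space with a toy random-transition environment; the PBCN state encoding, attack action space, and
+ reward used in the paper are not modeled here.
+
+```python
+import numpy as np
+
+def q_learning_update(Q, s, a, r, s_next, alpha=0.1, gamma=0.95):
+    """One step: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))."""
+    td_target = r + gamma * Q[s_next].max()
+    Q[s, a] += alpha * (td_target - Q[s, a])
+    return Q
+
+n_states, n_actions = 8, 3
+Q = np.zeros((n_states, n_actions))
+rng = np.random.default_rng(0)
+for _ in range(1000):                          # toy environment with random transitions
+    s, a = rng.integers(n_states), rng.integers(n_actions)
+    s_next = rng.integers(n_states)
+    r = float(s_next == n_states - 1)          # reward for reaching a target state
+    Q = q_learning_update(Q, s, a, r, s_next)
+print(Q.round(2))
+```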
+
+
+
+
+ + ☆ AnyLens: A Generative Diffusion Model with Any Rendering Lens + + +
+ State-of-the-art diffusion models can generate highly realistic images based +on various conditioning like text, segmentation, and depth. However, an +essential aspect that is often overlooked is the specific camera geometry used +during image capture and the influence of different optical systems on the final scene +appearance. This study introduces a framework that +intimately integrates a text-to-image diffusion model with the particular lens +geometry used in image rendering. Our method is based on a per-pixel coordinate +conditioning method, enabling control over the rendering geometry. Notably, +we demonstrate the manipulation of curvature properties, achieving diverse +visual effects, such as fish-eye, panoramic views, and spherical texturing +using a single diffusion model. + +
+
+
+
+
+ + ☆ Adversarial Robust Memory-Based Continual Learner + + +
+ Despite the remarkable advances that have been made in continual learning, +the adversarial vulnerability of such methods has not been fully discussed. We +delve into the adversarial robustness of memory-based continual learning +algorithms and observe limited robustness improvement by directly applying +adversarial training techniques. Preliminary studies reveal the twin challenges +for building adversarially robust continual learners: accelerated forgetting in +continual learning and gradient obfuscation in adversarial robustness. In this +study, we put forward a novel adversarially robust memory-based continual learner +that adjusts data logits to mitigate the forgetting of past knowledge caused by +adversarial samples. Furthermore, we devise a gradient-based data selection +mechanism to overcome the gradient obfuscation caused by limited stored data. +The proposed approach can widely integrate with existing memory-based continual +learning as well as adversarial training algorithms in a plug-and-play way. +Extensive experiments on Split-CIFAR10/100 and Split-Tiny-ImageNet demonstrate +the effectiveness of our approach, achieving up to 8.13% higher accuracy for +adversarial data. + +
+
+
+
+
+ + ☆ Topology-Preserving Adversarial Training + + +
+ Despite its effectiveness in improving the robustness of neural networks, adversarial training suffers from the natural accuracy degradation problem, i.e., accuracy on natural samples drops significantly. In this study, we show through quantitative and qualitative experiments that natural accuracy degradation is closely related to the disruption of the natural-sample topology in the representation space. Based on this observation, we propose Topology-pReserving Adversarial traINing (TRAIN) to alleviate the problem by preserving, during adversarial training, the topology structure of natural samples taken from a standard model trained only on natural data. As an additional regularization, our method can easily be combined with various popular adversarial training algorithms in a plug-and-play manner, taking advantage of both sides. Extensive experiments on CIFAR-10, CIFAR-100, and Tiny ImageNet show that our method achieves consistent and significant improvements over various strong baselines in most cases. Specifically, without additional data, it achieves up to an 8.78% improvement in natural accuracy and a 4.50% improvement in robust accuracy.
+
+
+
+
+ + ☆ Continual Learning with Low Rank Adaptation NeurIPS 2023 + + +
+ Recent work using pretrained transformers has shown impressive performance when they are fine-tuned with data from the downstream problem of interest. However, such models struggle to retain that performance when the data characteristics change. In this paper, we focus on continual learning, where a pre-trained transformer is updated to perform well on new data while retaining its performance on data it was previously trained on. Earlier works have tackled this primarily through methods inspired by prompt tuning. We question this choice and investigate the applicability of Low Rank Adaptation (LoRA) to continual learning. On a range of domain-incremental learning benchmarks, our LoRA-based solution, CoLoR, yields state-of-the-art performance while remaining as parameter-efficient as the prompt-tuning-based methods.
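As background for the abstract above, a minimal PyTorch sketch of the Low Rank Adaptation idea: the pretrained weight is frozen and only a low-rank correction B·A is trained. This is a generic LoRA layer written under common conventions (rank, scaling alpha), not the CoLoR implementation.

import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, rank: int = 8, alpha: float = 16.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():          # freeze the pretrained weights
            p.requires_grad_(False)
        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, rank))
        self.scale = alpha / rank

    def forward(self, x):
        # frozen pretrained path plus trainable low-rank correction
        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)

layer = LoRALinear(nn.Linear(768, 768), rank=8)
trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
print(trainable)  # only A and B are updated: 2 * 8 * 768 parameters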
+
+ comment: Accepted at Workshop on Distribution Shifts (DistShift), NeurIPS 2023 +
+
+
+
+
+ + ☆ Improving embedding of graphs with missing data by soft manifolds + + +
+ Embedding graphs in continuous spaces is a key factor in designing and developing algorithms for automatic information extraction across diverse tasks (e.g., learning, inference, prediction). The reliability of graph embeddings directly depends on how well the geometry of the continuous space matches the graph structure. Manifolds are mathematical structures whose topological spaces can incorporate graph characteristics, and in particular the distances between nodes. State-of-the-art manifold-based graph embedding algorithms rely on the assumption that the projection onto the tangent space at each point of the manifold (corresponding to a node in the graph) locally resembles a Euclidean space. Although this condition helps in obtaining efficient analytical solutions to the embedding problem, it is not an adequate setup for modern real-life graphs, which are characterized by weighted connections across nodes, often computed over sparse datasets with missing records. In this work, we introduce a new class of manifolds, named soft manifolds, that addresses this situation. In particular, soft manifolds are mathematical structures with spherical symmetry in which the tangent spaces at each point are hypocycloids whose shape is defined according to the velocity of information propagation across the data points. Using soft manifolds for graph embedding, we can provide continuous spaces suitable for any data-analysis task over complex datasets. Experimental results on reconstruction tasks with synthetic and real datasets show how the proposed approach enables a more accurate and reliable characterization of graphs in continuous spaces with respect to the state of the art.
+
+
+
+
+ + ☆ LanGWM: Language Grounded World Model + + +
+ Recent advances in deep reinforcement learning have showcased its potential for tackling complex tasks. However, experiments on visual control tasks have revealed that state-of-the-art reinforcement learning models struggle with out-of-distribution generalization. Conversely, expressing higher-level concepts and global contexts is relatively easy using language. Building upon the recent success of large language models, our main objective is to improve the state abstraction technique in reinforcement learning by leveraging language for robust action selection. Specifically, we focus on learning language-grounded visual features to enhance world model learning, a model-based reinforcement learning technique. To instantiate our hypothesis explicitly, we mask out the bounding boxes of a few objects in the image observation and provide a text prompt describing these masked objects. Subsequently, we predict the masked objects along with the surrounding regions as pixel reconstruction, similar to the transformer-based masked autoencoder approach. Our proposed LanGWM: Language Grounded World Model achieves state-of-the-art performance on the out-of-distribution test of the 100K-interaction-step benchmark of iGibson point navigation tasks. Furthermore, our proposed technique of explicit language-grounded visual representation learning has the potential to improve models for human-robot interaction, because the extracted visual features are language grounded.
+
+
+
+
+ + ☆ Federated Online and Bandit Convex Optimization + + +
+ We study the problems of distributed online and bandit convex optimization +against an adaptive adversary. We aim to minimize the average regret on $M$ +machines working in parallel over $T$ rounds with $R$ intermittent +communications. Assuming the underlying cost functions are convex and can be +generated adaptively, our results show that collaboration is not beneficial +when the machines have access to the first-order gradient information at the +queried points. This is in contrast to the case for stochastic functions, where +each machine samples the cost functions from a fixed distribution. Furthermore, +we delve into the more challenging setting of federated online optimization +with bandit (zeroth-order) feedback, where the machines can only access values +of the cost functions at the queried points. The key finding here is +identifying the high-dimensional regime where collaboration is beneficial and +may even lead to a linear speedup in the number of machines. We further +illustrate our findings through federated adversarial linear bandits by +developing novel distributed single and two-point feedback algorithms. Our work +is the first attempt towards a systematic understanding of federated online +optimization with limited feedback, and it attains tight regret bounds in the +intermittent communication setting for both first and zeroth-order feedback. +Our results thus bridge the gap between stochastic and adaptive settings in +federated online optimization. + +
+
+
+
+
+ + ☆ LoCoMotif: Discovering time-warped motifs in time series ECML + + +
+ Time Series Motif Discovery (TSMD) refers to the task of identifying patterns +that occur multiple times (possibly with minor variations) in a time series. +All existing methods for TSMD have one or more of the following limitations: +they only look for the two most similar occurrences of a pattern; they only +look for patterns of a pre-specified, fixed length; they cannot handle +variability along the time axis; and they only handle univariate time series. +In this paper, we present a new method, LoCoMotif, that has none of these +limitations. The method is motivated by a concrete use case from physiotherapy. +We demonstrate the value of the proposed method on this use case. We also +introduce a new quantitative evaluation metric for motif discovery, and +benchmark data for comparing TSMD methods. LoCoMotif substantially outperforms +the existing methods, on top of being more broadly applicable. + +
+
+ comment: 26 pages, 15 figures. Submitted to the journal track of the European + Conference on Machine Learning and Principles and Practice of Knowledge + Discovery in Databases (ECMLPKDD) 2024 in partnership with the Data Mining + and Knowledge Discovery journal. Source code of the method is available at + http://github.com/ML-KULeuven/locomotif +
+
+
+
+
+ + ☆ Bias Resilient Multi-Step Off-Policy Goal-Conditioned Reinforcement + Learning + + +
+ In goal-conditioned reinforcement learning (GCRL), sparse rewards present significant challenges, often obstructing efficient learning. Although multi-step GCRL can boost this efficiency, it can also lead to off-policy biases in target values. This paper dives deep into these biases, dividing them into two distinct categories: "shooting" and "shifting". Recognizing that certain behavior policies can hasten policy refinement, we present solutions designed to capitalize on the positive aspects of these biases while minimizing their drawbacks, enabling the use of larger step sizes to speed up GCRL. An empirical study demonstrates that our approach ensures a resilient and robust improvement, even in ten-step learning scenarios, leading to superior learning efficiency and performance that generally surpass the baseline and several state-of-the-art multi-step GCRL benchmarks.
+
+ comment: 26 pages, 7 figures +
+
+
+
+
+ + ☆ Interpreting Differentiable Latent States for Healthcare Time-series + Data + + +
+ Machine learning enables extracting clinical insights from large temporal +datasets. The applications of such machine learning models include identifying +disease patterns and predicting patient outcomes. However, limited +interpretability poses challenges for deploying advanced machine learning in +digital healthcare. Understanding the meaning of latent states is crucial for +interpreting machine learning models, assuming they capture underlying +patterns. In this paper, we present a concise algorithm that allows for i) +interpreting latent states using highly related input features; ii) +interpreting predictions using subsets of input features via latent states; and +iii) interpreting changes in latent states over time. The proposed algorithm is +feasible for any model that is differentiable. We demonstrate that this +approach enables the identification of a daytime behavioral pattern for +predicting nocturnal behavior in a real-world healthcare dataset. + +
+
+
+
+
+ + ☆ An Efficient Illumination Invariant Tiger Detection Framework for + Wildlife Surveillance + + +
+ Tiger conservation necessitates the strategic deployment of multifaceted +initiatives encompassing the preservation of ecological habitats, anti-poaching +measures, and community involvement for sustainable growth in the tiger +population. With the advent of artificial intelligence, tiger surveillance can +be automated using object detection. In this paper, an accurate illumination +invariant framework is proposed based on EnlightenGAN and YOLOv8 for tiger +detection. The fine-tuned YOLOv8 model achieves a mAP score of 61% without +illumination enhancement. The illumination enhancement improves the mAP by +0.7%. The approaches elevate the state-of-the-art performance on the ATRW +dataset by approximately 6% to 7%. + +
+
+ comment: accepted at ICCIS 2023 +
+
+
+
+
+ + ☆ The Effects of Overparameterization on Sharpness-aware Minimization: An + Empirical and Theoretical Analysis + + +
+ Training an overparameterized neural network can yield minimizers that achieve the same level of training loss yet differ in generalization capability. With evidence indicating a correlation between the sharpness of minima and their generalization errors, increasing effort has gone into developing optimization methods that explicitly seek flat minima as more generalizable solutions. However, how overparameterization actually affects the behavior of this sharpness-aware minimization (SAM) strategy has not yet been studied much. In this work, we analyze SAM under varying degrees of overparameterization and present both empirical and theoretical results that suggest a critical influence of overparameterization on SAM. Specifically, we first use standard techniques in optimization to prove that SAM can achieve a linear convergence rate under overparameterization in a stochastic setting. We also show that the linearly stable minima found by SAM are indeed flatter and have more uniformly distributed Hessian moments compared to those of SGD. These results are corroborated by our experiments, which reveal a consistent trend that the generalization improvement made by SAM continues to increase as the model becomes more overparameterized. We further show that sparsity can open up an avenue for effective overparameterization in practice.
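A minimal sketch of the sharpness-aware minimization step analyzed above, written as generic PyTorch training code: first an ascent step to the approximate worst-case weights within radius rho, then a descent step using the gradient taken at that perturbed point. The function signature and hyperparameters are illustrative, not the authors' code.

import torch

def sam_step(model, loss_fn, data, target, optimizer, rho=0.05):
    # assumes every parameter receives a gradient from the loss
    optimizer.zero_grad()
    loss_fn(model(data), target).backward()
    grads = [p.grad.detach().clone() for p in model.parameters()]
    grad_norm = torch.sqrt(sum((g ** 2).sum() for g in grads)) + 1e-12
    eps = [rho * g / grad_norm for g in grads]
    with torch.no_grad():                          # ascent: move to the local worst case
        for p, e in zip(model.parameters(), eps):
            p.add_(e)
    optimizer.zero_grad()
    loss_fn(model(data), target).backward()        # gradient at the perturbed point
    with torch.no_grad():                          # restore the original weights
        for p, e in zip(model.parameters(), eps):
            p.sub_(e)
    optimizer.step()                               # descend with the sharpness-aware gradient
    optimizer.zero_grad()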
+
+
+
+
+ + ☆ The devil is in the fine-grained details: Evaluating open-vocabulary + object detectors for fine-grained understanding + + +
+ Recent advancements in large vision-language models have enabled visual object detection in open-vocabulary scenarios, where object classes are defined in free-text form at inference time. In this paper, we aim to probe the state-of-the-art methods for open-vocabulary object detection to determine to what extent they understand fine-grained properties of objects and their parts. To this end, we introduce an evaluation protocol based on dynamic vocabulary generation that tests whether models detect, discern, and assign the correct fine-grained description to objects in the presence of hard-negative classes. We contribute a benchmark suite of increasing difficulty that probes different properties such as color, pattern, and material. We further evaluate several state-of-the-art open-vocabulary object detectors using the proposed protocol and find that most existing solutions, which shine in standard open-vocabulary benchmarks, struggle to accurately capture and distinguish finer object details. We conclude the paper by highlighting the limitations of current methodologies and exploring promising research directions to overcome the discovered drawbacks. Data and code are available at https://github.com/lorebianchi98/FG-OVD.
+
+
+
+
+ + ☆ Model Performance Prediction for Hyperparameter Optimization of Deep + Learning Models Using High Performance Computing and Quantum Annealing + + +
+ Hyperparameter Optimization (HPO) of deep learning models tends to be a compute-intensive process, as it usually requires training the target model with many different hyperparameter configurations. We show that integrating model performance prediction with early-stopping methods holds great potential to speed up the HPO process for deep learning models. Moreover, we propose a novel algorithm called Swift-Hyperband that can use either classical or quantum support vector regression for performance prediction and benefits from distributed High Performance Computing environments. The algorithm is tested not only on the Machine-Learned Particle Flow model used in High Energy Physics, but also on a wider range of target models from domains such as computer vision and natural language processing. Swift-Hyperband is shown to find comparable (or better) hyperparameters while using fewer computational resources in all test cases.
+
+
+
+
+ + ☆ Slot-Mixup with Subsampling: A Simple Regularization for WSI + Classification + + +
+ Whole slide image (WSI) classification requires repetitive zooming in and out for pathologists, as only small portions of the slide may be relevant to detecting cancer. Due to the lack of patch-level labels, multiple instance learning (MIL) is a common practice for training a WSI classifier. One of the challenges of MIL for WSIs is the weak supervision coming only from the slide-level labels, which often results in severe overfitting. In response, researchers have considered adopting patch-level augmentation or applying mixup augmentation, but their applicability remains unverified. Our approach augments the training dataset by sampling a subset of patches in the WSI without significantly altering the underlying semantics of the original slides. Additionally, we introduce an efficient model (Slot-MIL) that uses an attention mechanism to organize patches into a fixed number of slots, an abstract representation of the patches. We empirically demonstrate that the subsampling augmentation helps produce more informative slots by restricting the over-concentration of attention, and improves interpretability. Finally, we illustrate that combining our attention-based aggregation model with subsampling and mixup, which has shown limited compatibility with existing MIL methods, can enhance both generalization and calibration. Our proposed methods achieve state-of-the-art performance across various benchmark datasets, including settings with class imbalance and distribution shifts.
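A minimal sketch of the two augmentations combined above: subsampling the bag of patch features of a slide, and mixing slot-level representations (and labels) of two slides. The tensor shapes and the Beta-distributed mixing coefficient follow common mixup conventions and are assumptions, not the paper's exact formulation.

import torch

def subsample_bag(patch_feats: torch.Tensor, keep_ratio: float = 0.5):
    """patch_feats: (num_patches, dim). Randomly keep a subset of the patches."""
    n = patch_feats.size(0)
    idx = torch.randperm(n)[: max(1, int(keep_ratio * n))]
    return patch_feats[idx]

def slot_mixup(slots_a, slots_b, label_a, label_b, alpha: float = 1.0):
    """Mix two slide-level slot representations (num_slots, dim) and their labels."""
    lam = torch.distributions.Beta(alpha, alpha).sample()
    mixed_slots = lam * slots_a + (1 - lam) * slots_b
    mixed_label = lam * label_a + (1 - lam) * label_b
    return mixed_slots, mixed_label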
+
+
+
+
+ + ☆ Wireless Network Digital Twin for 6G: Generative AI as A Key Enabler + + +
+ Digital twins, which enable emulation, evaluation, and optimization of physical entities through synchronized digital replicas, have gained increasing attention as a promising technology for intricate wireless networks. For 6G, numerous innovative wireless technologies and network architectures have posed new challenges for establishing wireless network digital twins. To tackle these challenges, artificial intelligence (AI), particularly the flourishing generative AI, emerges as a potential solution. In this article, we discuss emerging prerequisites for wireless network digital twins, considering the complicated network architecture, tremendous network scale, extensive coverage, and diversified application scenarios of the 6G era. We further explore the applications of generative AI, such as transformers and diffusion models, to empower the 6G digital twin from multiple perspectives, including implementation, physical-digital synchronization, and slicing capability. Subsequently, we propose a hierarchical generative AI-enabled wireless network digital twin at both the message level and the policy level, and provide a typical use case with numerical results to validate its effectiveness and efficiency. Finally, open research issues for wireless network digital twins in the 6G era are discussed.
+
+
+
+
+ + ☆ Uncertainty in Additive Feature Attribution methods + + +
+ In this work, we explore various topics that fall under the umbrella of +Uncertainty in post-hoc Explainable AI (XAI) methods. We in particular focus on +the class of additive feature attribution explanation methods. We first +describe our specifications of uncertainty and compare various statistical and +recent methods to quantify the same. Next, for a particular instance, we study +the relationship between a feature's attribution and its uncertainty and +observe little correlation. As a result, we propose a modification in the +distribution from which perturbations are sampled in LIME-based algorithms such +that the important features have minimal uncertainty without an increase in +computational cost. Next, while studying how the uncertainty in explanations +varies across the feature space of a classifier, we observe that a fraction of +instances show near-zero uncertainty. We coin the term "stable instances" for +such instances and diagnose factors that make an instance stable. Next, we +study how an XAI algorithm's uncertainty varies with the size and complexity of +the underlying model. We observe that the more complex the model, the more +inherent uncertainty is exhibited by it. As a result, we propose a measure to +quantify the relative complexity of a blackbox classifier. This could be +incorporated, for example, in LIME-based algorithms' sampling densities, to +help different explanation algorithms achieve tighter confidence levels. +Together, the above measures would have a strong impact on making XAI models +relatively trustworthy for the end-user as well as aiding scientific discovery. + +
+
+ comment: 14 +
+
+
+
+
+ + ☆ Group-wise Sparse and Explainable Adversarial Attacks + + +
+ Sparse adversarial attacks fool deep neural networks (DNNs) through minimal +pixel perturbations, typically regularized by the $\ell_0$ norm. Recent efforts +have replaced this norm with a structural sparsity regularizer, such as the +nuclear group norm, to craft group-wise sparse adversarial attacks. The +resulting perturbations are thus explainable and hold significant practical +relevance, shedding light on an even greater vulnerability of DNNs than +previously anticipated. However, crafting such attacks poses an optimization +challenge, as it involves computing norms for groups of pixels within a +non-convex objective. In this paper, we tackle this challenge by presenting an +algorithm that simultaneously generates group-wise sparse attacks within +semantically meaningful areas of an image. In each iteration, the core +operation of our algorithm involves the optimization of a quasinorm adversarial +loss. This optimization is achieved by employing the $1/2$-quasinorm proximal +operator for some iterations, a method tailored for nonconvex programming. +Subsequently, the algorithm transitions to a projected Nesterov's accelerated +gradient descent with $2$-norm regularization applied to perturbation +magnitudes. We rigorously evaluate the efficacy of our novel attack in both +targeted and non-targeted attack scenarios, on CIFAR-10 and ImageNet datasets. +When compared to state-of-the-art methods, our attack consistently results in a +remarkable increase in group-wise sparsity, e.g., an increase of $48.12\%$ on +CIFAR-10 and $40.78\%$ on ImageNet (average case, targeted attack), all while +maintaining lower perturbation magnitudes. Notably, this performance is +complemented by a significantly faster computation time and a $100\%$ attack +success rate. + +
+
+
+
+
+ + ☆ Grounding Foundation Models through Federated Transfer Learning: A + General Framework + + +
+ Foundation Models (FMs) such as GPT-4 encoded with vast knowledge and +powerful emergent abilities have achieved remarkable success in various natural +language processing and computer vision tasks. Grounding FMs by adapting them +to domain-specific tasks or augmenting them with domain-specific knowledge +enables us to exploit the full potential of FMs. However, grounding FMs faces +several challenges, stemming primarily from constrained computing resources, +data privacy, model heterogeneity, and model ownership. Federated Transfer +Learning (FTL), the combination of federated learning and transfer learning, +provides promising solutions to address these challenges. In recent years, the +need for grounding FMs leveraging FTL, coined FTL-FM, has arisen strongly in +both academia and industry. Motivated by the strong growth in FTL-FM research +and the potential impact of FTL-FM on industrial applications, we propose an +FTL-FM framework that formulates problems of grounding FMs in the federated +learning setting, construct a detailed taxonomy based on the FTL-FM framework +to categorize state-of-the-art FTL-FM works, and comprehensively overview +FTL-FM works based on the proposed taxonomy. We also establish correspondences +between FTL-FM and conventional phases of adapting FM so that FM practitioners +can align their research works with FTL-FM. In addition, we overview advanced +efficiency-improving and privacy-preserving techniques because efficiency and +privacy are critical concerns in FTL-FM. Last, we discuss opportunities and +future research directions of FTL-FM. + +
+
+ comment: in progress +
+
+
+
+
+ + ☆ GNNFlow: A Distributed Framework for Continuous Temporal GNN Learning on + Dynamic Graphs + + +
+ Graph Neural Networks (GNNs) play a crucial role in various fields. However, +most existing deep graph learning frameworks assume pre-stored static graphs +and do not support training on graph streams. In contrast, many real-world +graphs are dynamic and contain time domain information. We introduce GNNFlow, a +distributed framework that enables efficient continuous temporal graph +representation learning on dynamic graphs on multi-GPU machines. GNNFlow +introduces an adaptive time-indexed block-based data structure that effectively +balances memory usage with graph update and sampling operation efficiency. It +features a hybrid GPU-CPU graph data placement for rapid GPU-based temporal +neighborhood sampling and kernel optimizations for enhanced sampling processes. +A dynamic GPU cache for node and edge features is developed to maximize cache +hit rates through reuse and restoration strategies. GNNFlow supports +distributed training across multiple machines with static scheduling to ensure +load balance. We implement GNNFlow based on DGL and PyTorch. Our experimental +results show that GNNFlow provides up to 21.1x faster continuous learning than +existing systems. + +
+
+
+
+
+ + ☆ Gene-MOE: A Sparsely-gated Framework for Pan-Cancer Genomic Analysis + + +
+ Analyzing the genomic information in the Pan-Cancer database can help us understand cancer-related factors and contribute to cancer diagnosis and prognosis. However, existing computational and deep learning methods cannot effectively find the deep correlations among tens of thousands of genes, which leads to a loss of precision. In this paper, we propose a novel pretrained model called Gene-MOE that learns general feature representations of the Pan-Cancer dataset and transfers the pretrained weights to downstream tasks. Gene-MOE fully exploits mixture-of-experts (MOE) layers to learn rich feature representations of high-dimensional genes. At the same time, we build a mixture-of-attention-experts (MOAE) model to learn the deep semantic relationships within genetic features. Finally, we propose a new self-supervised pretraining strategy, including loss function design, data augmentation, and an optimization strategy, to train Gene-MOE and further improve performance on downstream analysis. We carried out cancer classification and survival analysis experiments based on Gene-MOE. According to the survival analysis results on 14 cancer types, Gene-MOE outperformed state-of-the-art models on 12 of them. According to the classification results, the total accuracy of the classification model for 33 cancer types reached 95.2\%. Through detailed feature analysis, we found that Gene-MOE can learn rich feature representations of high-dimensional genes.
+
+ comment: submit to bioinformatics +
+
+
+
+
+ + ☆ Improving the Robustness of Transformer-based Large Language Models with + Dynamic Attention + + +
+ Transformer-based models, such as BERT and GPT, have been widely adopted in natural language processing (NLP) due to their exceptional performance. However, recent studies show their vulnerability to textual adversarial attacks, where the model's output can be misled by intentionally manipulated text inputs. Despite the various methods proposed to enhance model robustness and mitigate this vulnerability, many require heavy computational resources (e.g., adversarial training) or provide only limited protection (e.g., defensive dropout). In this paper, we propose a novel method called dynamic attention, tailored to the transformer architecture, to enhance the inherent robustness of the model itself against various adversarial attacks. Our method requires no downstream task knowledge and incurs no additional cost. The proposed dynamic attention consists of two modules: (i) attention rectification, which masks or weakens the attention value of the chosen tokens, and (ii) dynamic modeling, which dynamically builds the set of candidate tokens. Extensive experiments demonstrate that dynamic attention significantly mitigates the impact of adversarial attacks, achieving up to 33\% better performance than previous methods against widely used adversarial attacks. The model-level design of dynamic attention enables it to be easily combined with other defense methods (e.g., adversarial training) to further enhance the model's robustness. Furthermore, we demonstrate that dynamic attention preserves the state-of-the-art robustness space of the original model better than other dynamic modeling methods.
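A minimal sketch of the attention-rectification module described above: weaken or mask the attention received by a set of candidate tokens and renormalize the attention rows. The top-k selection heuristic used here for choosing candidates is an illustrative assumption, not the paper's dynamic modeling procedure.

import torch

def rectify_attention(attn: torch.Tensor, k: int = 2, weaken: float = 0.0):
    """attn: (heads, seq, seq) post-softmax attention weights."""
    # pick candidate tokens that receive the most total attention (illustrative heuristic)
    received = attn.sum(dim=(0, 1))                     # (seq,)
    candidates = received.topk(k).indices
    attn = attn.clone()
    attn[:, :, candidates] *= weaken                    # weaken=0.0 masks, 0<weaken<1 down-weights
    return attn / attn.sum(dim=-1, keepdim=True).clamp_min(1e-12)  # renormalize each row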
+
+
+
+
+ + ☆ The Devil is in the Data: Learning Fair Graph Neural Networks via + Partial Knowledge Distillation WSDM 2024 + + +
+ Graph neural networks (GNNs) are being increasingly used in many high-stakes tasks, and as a result, their fairness has recently attracted growing attention. GNNs have been shown to be unfair, as they tend to make discriminatory decisions toward certain demographic groups defined by sensitive attributes such as gender and race. While recent works have been devoted to improving fairness performance, they often require accessible demographic information, which greatly limits their applicability in real-world scenarios due to legal restrictions. To address this problem, we present a demographic-agnostic method to learn fair GNNs via knowledge distillation, namely FairGKD. Our work is motivated by the empirical observation that training GNNs on partial data (i.e., only node attributes or only topology data) can improve their fairness, albeit at the cost of utility. To strike a balanced trade-off between fairness and utility, we employ a set of fairness experts (i.e., GNNs trained on different partial data) to construct a synthetic teacher, which distills fairer and more informative knowledge to guide the learning of the GNN student. Experiments on several benchmark datasets demonstrate that FairGKD, which does not require access to demographic information, significantly improves the fairness of GNNs by a large margin while maintaining their utility.
+
+ comment: Accepted by WSDM 2024 +
+
+
+
+
+ + ☆ Continuous optimization by quantum adaptive distribution search + + +
+ In this paper, we introduce the quantum adaptive distribution search (QuADS), +a quantum continuous optimization algorithm that integrates Grover adaptive +search (GAS) with the covariance matrix adaptation - evolution strategy +(CMA-ES), a classical technique for continuous optimization. QuADS utilizes the +quantum-based search capabilities of GAS and enhances them with the principles +of CMA-ES for more efficient optimization. It employs a multivariate normal +distribution for the initial state of the quantum search and repeatedly updates +it throughout the optimization process. Our numerical experiments show that +QuADS outperforms both GAS and CMA-ES. This is achieved through adaptive +refinement of the initial state distribution rather than consistently using a +uniform state, resulting in fewer oracle calls. This study presents an +important step toward exploiting the potential of quantum computing for +continuous optimization. + +
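A purely classical sketch of the adaptive-distribution idea that QuADS borrows from CMA-ES: sample candidates from a multivariate normal, keep the best, and refit the mean and covariance for the next round. The Grover-style quantum search that QuADS wraps around this loop is not modeled here, and the simple elite-fraction update is a simplification of the full CMA-ES update.

import numpy as np

def adaptive_distribution_search(f, dim, iters=50, pop=32, elite_frac=0.25, seed=0):
    """Minimize f by repeatedly refitting a multivariate normal to the best samples."""
    rng = np.random.default_rng(seed)
    mean, cov = np.zeros(dim), np.eye(dim)
    for _ in range(iters):
        samples = rng.multivariate_normal(mean, cov, size=pop)
        scores = np.array([f(s) for s in samples])
        elite = samples[np.argsort(scores)[: int(elite_frac * pop)]]   # best candidates
        mean = elite.mean(axis=0)
        cov = np.cov(elite, rowvar=False) + 1e-6 * np.eye(dim)         # keep covariance well-conditioned
    return mean

best = adaptive_distribution_search(lambda x: float(np.sum((x - 1.0) ** 2)), dim=3)
print(best)  # converges toward the minimizer [1, 1, 1]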
+
+
+
+
+ + ☆ Efficient Stitchable Task Adaptation + + +
+ The paradigm of pre-training and fine-tuning has laid the foundation for +deploying deep learning models. However, most fine-tuning methods are designed +to meet a specific resource budget. Recently, considering diverse deployment +scenarios with various resource budgets, stitchable neural network (SN-Net) is +introduced to quickly obtain numerous new networks (stitches) from the +pre-trained models (anchors) in a model family via model stitching. Although +promising, SN-Net confronts new challenges when adapting it to new target +domains, including huge memory and storage requirements and a long and +sub-optimal multistage adaptation process. In this work, we present a novel +framework, Efficient Stitchable Task Adaptation (ESTA), to efficiently produce +a palette of fine-tuned models that adhere to diverse resource constraints. +Specifically, we first tailor parameter-efficient fine-tuning to share low-rank +updates among the stitches while maintaining independent bias terms. In this +way, we largely reduce fine-tuning memory burdens and mitigate the interference +among stitches that arises in task adaptation. Furthermore, we streamline a +simple yet effective one-stage deployment pipeline, which estimates the +important stitches to deploy with training-time gradient statistics. By +assigning higher sampling probabilities to important stitches, we also get a +boosted Pareto frontier. Extensive experiments on 25 downstream visual +recognition tasks demonstrate that our ESTA is capable of generating stitches +with smooth accuracy-efficiency trade-offs and surpasses the direct SN-Net +adaptation by remarkable margins with significantly lower training time and +fewer trainable parameters. Furthermore, we demonstrate the flexibility and +scalability of our ESTA framework by stitching LLMs from LLaMA family, +obtaining chatbot stitches of assorted sizes. + +
+
+ comment: Source code will be released at + https://github.com/ziplab/Stitched_LLaMA +
+
+
+
+
+ + ☆ Improving Self-supervised Molecular Representation Learning using + Persistent Homology NeurIPS 2023 + + +
+ Self-supervised learning (SSL) has great potential for molecular representation learning, given the complexity of molecular graphs, the large amounts of unlabelled data available, the considerable cost of obtaining labels experimentally, and the consequently often small training datasets. The importance of the topic is reflected in the variety of paradigms and architectures that have been investigated recently. Yet the differences in performance are often minor and barely understood to date. In this paper, we study SSL based on persistent homology (PH), a mathematical tool for modeling topological features of data that persist across multiple scales. It has several unique features that particularly suit SSL, naturally offering: different views of the data, stability in terms of distance preservation, and the opportunity to flexibly incorporate domain knowledge. We (1) investigate an autoencoder, which shows the general representational power of PH, and (2) propose a contrastive loss that complements existing approaches. We rigorously evaluate our approach for molecular property prediction and demonstrate its particular features in improving the embedding space: after SSL, the representations are better and offer considerably more predictive power than the baselines over different probing tasks; our loss increases baseline performance, sometimes by a large margin; and we often obtain substantial improvements over very small datasets, a common scenario in practice.
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ☆ Mostly Beneficial Clustering: Aggregating Data for Operational Decision + Making + + +
+ With increasingly volatile market conditions and rapid product innovations, +operational decision-making for large-scale systems entails solving thousands +of problems with limited data. Data aggregation is proposed to combine the data +across problems to improve the decisions obtained by solving those problems +individually. We propose a novel cluster-based shrunken-SAA approach that can +exploit the cluster structure among problems when implementing the data +aggregation approaches. We prove that, as the number of problems grows, +leveraging the known cluster structure among problems yields additional +benefits over the data aggregation approaches that neglect such structure. When +the cluster structure is unknown, we show that unveiling the cluster structure, +even at the cost of a few data points, can be beneficial, especially when the +distance between clusters of problems is substantial. Our proposed approach can +be extended to general cost functions under mild conditions. When the number of +problems gets large, the optimality gap of our proposed approach decreases +exponentially in the distance between the clusters. We explore the performance +of the proposed approach through the application of managing newsvendor systems +via numerical experiments. We investigate the impacts of distance metrics +between problem instances on the performance of the cluster-based Shrunken-SAA +approach with synthetic data. We further validate our proposed approach with +real data and highlight the advantages of cluster-based data aggregation, +especially in the small-data large-scale regime, compared to the existing +approaches. + +
+
+
+
+
+ + ☆ Accelerating DNN Training With Photonics: A Residue Number System-Based + Design + + +
+ Photonic computing is a compelling avenue for performing highly efficient +matrix multiplication, a crucial operation in Deep Neural Networks (DNNs). +While this method has shown great success in DNN inference, meeting the high +precision demands of DNN training proves challenging due to the precision +limitations imposed by costly data converters and the analog noise inherent in +photonic hardware. This paper proposes Mirage, a photonic DNN training +accelerator that overcomes the precision challenges in photonic hardware using +the Residue Number System (RNS). RNS is a numeral system based on modular +arithmetic$\unicode{x2014}$allowing us to perform high-precision operations via +multiple low-precision modular operations. In this work, we present a novel +micro-architecture and dataflow for an RNS-based photonic tensor core +performing modular arithmetic in the analog domain. By combining RNS and +photonics, Mirage provides high energy efficiency without compromising +precision and can successfully train state-of-the-art DNNs achieving accuracy +comparable to FP32 training. Our study shows that on average across several +DNNs when compared to systolic arrays, Mirage achieves more than $23.8\times$ +faster training and $32.1\times$ lower EDP in an iso-energy scenario and +consumes $42.8\times$ lower power with comparable or better EDP in an iso-area +scenario. + +
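A small sketch of the Residue Number System arithmetic mentioned above: an integer is represented by its residues modulo pairwise-coprime moduli, multiplication is performed independently in each low-precision channel, and the result is recovered with the Chinese Remainder Theorem. The moduli below are illustrative, not Mirage's design parameters.

from math import prod

MODULI = (251, 241, 239)              # pairwise coprime, each residue fits in 8 bits

def to_rns(x):
    return tuple(x % m for m in MODULI)

def rns_mul(a, b):
    # multiply independently in each low-precision channel
    return tuple((x * y) % m for x, y, m in zip(a, b, MODULI))

def from_rns(r):
    # Chinese Remainder Theorem reconstruction; exact only while the result < prod(MODULI)
    M = prod(MODULI)
    x = 0
    for ri, mi in zip(r, MODULI):
        Mi = M // mi
        x += ri * Mi * pow(Mi, -1, mi)
    return x % M

a, b = 1234, 5678                     # a * b = 7,006,652 < prod(MODULI), so recovery is exact
assert from_rns(rns_mul(to_rns(a), to_rns(b))) == a * b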
+
+
+
+
+ + ☆ Enhancing the Performance of Neural Networks Through Causal Discovery + and Integration of Domain Knowledge + + +
+ In this paper, we develop a generic methodology for encoding a hierarchical causality structure among observed variables into a neural network in order to improve its predictive performance. The proposed methodology, called causality-informed neural network (CINN), leverages three coherent steps to systematically map structural causal knowledge into the layer-to-layer design of a neural network while strictly preserving the orientation of every causal relationship. In the first step, CINN discovers causal relationships from observational data via directed acyclic graph (DAG) learning, where causal discovery is recast as a continuous optimization problem to avoid its combinatorial nature. In the second step, the discovered hierarchical causality structure among observed variables is systematically encoded into the neural network through a dedicated architecture and a customized loss function. By categorizing variables in the causal DAG as root, intermediate, and leaf nodes, the hierarchical causal DAG is translated into a CINN with a one-to-one correspondence between nodes in the causal DAG and units in the CINN, while maintaining the relative order among these nodes. Regarding the loss function, both intermediate and leaf nodes in the DAG are treated as target outputs during CINN training, so as to drive co-learning of causal relationships among different types of nodes. As multiple loss components emerge in CINN, we leverage the projection of conflicting gradients to mitigate gradient interference among the multiple learning tasks. Computational experiments across a broad spectrum of UCI datasets demonstrate substantial advantages of CINN in predictive performance over other state-of-the-art methods. In addition, an ablation study underscores the value of integrating structural and quantitative causal knowledge to incrementally enhance the neural network's predictive performance.
+
+
+
+
+ + ☆ Language Models: A Guide for the Perplexed + + +
+ Given the growing importance of AI literacy, we decided to write this +tutorial to help narrow the gap between the discourse among those who study +language models -- the core technology underlying ChatGPT and similar products +-- and those who are intrigued and want to learn more about them. In short, we +believe the perspective of researchers and educators can add some clarity to +the public's understanding of the technologies beyond what's currently +available, which tends to be either extremely technical or promotional material +generated about products by their purveyors. + Our approach teases apart the concept of a language model from products built +on them, from the behaviors attributed to or desired from those products, and +from claims about similarity to human cognition. As a starting point, we (1) +offer a scientific viewpoint that focuses on questions amenable to study +through experimentation; (2) situate language models as they are today in the +context of the research that led to their development; and (3) describe the +boundaries of what is known about the models at this writing. + +
+
+
+
+
+ + ☆ Federated Fine-Tuning of Foundation Models via Probabilistic Masking + + +
+ Foundation Models (FMs) have revolutionized machine learning with their adaptability and high performance across tasks; yet, their integration into Federated Learning (FL) is challenging due to the substantial communication overhead stemming from their extensive parameterization. Current communication-efficient FL strategies, such as gradient compression, reduce bitrates to around $1$ bit-per-parameter (bpp). However, these approaches fail to harness the characteristics of FMs, whose large number of parameters still poses a challenge to communication efficiency, even at these bitrate regimes. In this work, we present DeltaMask, a novel method that efficiently fine-tunes FMs in FL at an ultra-low bitrate, well below 1 bpp. DeltaMask employs stochastic masking to detect highly effective subnetworks within FMs and leverages the stochasticity and sparsity of client masks to compress updates into a compact grayscale image using probabilistic filters, deviating from traditional weight-training approaches. Our comprehensive evaluations across various datasets and architectures demonstrate that DeltaMask efficiently achieves bitrates as low as 0.09 bpp, enhancing communication efficiency while maintaining FM performance, as measured on 8 datasets and 5 pre-trained models with various network architectures.
+
+ comment: 19 pages, 9 figures +
+
+
+
+
+ + ☆ Utilizing Model Residuals to Identify Rental Properties of Interest: The + Price Anomaly Score (PAS) and Its Application to Real-time Data in Manhattan + + +
+ Not knowing whether a property is priced fairly hinders buyers and sellers, since they usually lack an objective view of the price distribution for the overall market of interest. Drawing on data collected from all available rental properties in Manhattan as of September 2023, this paper aims to strengthen our understanding of model residuals, specifically for machine learning models that generalize to the majority of the distribution of a well-proportioned dataset. Most models treat deviations from predicted values as mere inaccuracies; however, this paper proposes a different vantage point: when a model generalizes to at least 75\% of the dataset, the remaining deviations reveal significant insights. To harness these insights, we introduce the Price Anomaly Score (PAS), a metric capable of capturing boundaries between irregularly predicted prices. By combining relative pricing discrepancies with statistical significance, the Price Anomaly Score (PAS) offers a multifaceted view of rental valuations. This metric allows experts to identify overpriced or underpriced properties within a dataset by aggregating PAS values and then fine-tuning the upper and lower boundaries to any threshold to set indicators of choice.
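A minimal sketch of the residual-based scoring idea behind the Price Anomaly Score: fit a price model, then standardize the residuals so that large positive or negative scores flag potentially over- or under-priced listings. The z-score formula, the choice of regressor, and the in-sample scoring are illustrative simplifications, not necessarily the paper's exact PAS definition.

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

def price_anomaly_scores(X, y, threshold=2.0):
    """X: feature matrix, y: listed prices. Returns standardized residuals and flags."""
    model = GradientBoostingRegressor().fit(X, y)
    residuals = y - model.predict(X)                 # positive: listed above the predicted price
    z = (residuals - residuals.mean()) / (residuals.std() + 1e-12)
    flags = np.where(z > threshold, "overpriced",
             np.where(z < -threshold, "underpriced", "typical"))
    return z, flags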
+
+ comment: 8 pages, 8 figures, dataset is available with DOI +
+
+
+
+
+ + ♻ ☆ Dynamic Neighborhood Construction for Structured Large Discrete Action + Spaces + + +
+ Large discrete action spaces (LDAS) remain a central challenge in +reinforcement learning. Existing solution approaches can handle unstructured +LDAS with up to a few million actions. However, many real-world applications in +logistics, production, and transportation systems have combinatorial action +spaces, whose size grows well beyond millions of actions, even on small +instances. Fortunately, such action spaces exhibit structure, e.g., equally +spaced discrete resource units. With this work, we focus on handling structured +LDAS (SLDAS) with sizes that cannot be handled by current benchmarks: we +propose Dynamic Neighborhood Construction (DNC), a novel exploitation paradigm +for SLDAS. We present a scalable neighborhood exploration heuristic that +utilizes this paradigm and efficiently explores the discrete neighborhood +around the continuous proxy action in structured action spaces with up to +$10^{73}$ actions. We demonstrate the performance of our method by benchmarking +it against three state-of-the-art approaches designed for large discrete action +spaces across two distinct environments. Our results show that DNC matches or +outperforms state-of-the-art approaches while being computationally more +efficient. Furthermore, our method scales to action spaces that so far remained +computationally intractable for existing methodologies. + +
+
+
+
+
+ + ♻ ☆ Algorithmic Assistance with Recommendation-Dependent Preferences + + +
+ When we use algorithms to produce risk assessments, we typically think of +these predictions as providing helpful input to human decisions, such as when +risk scores are presented to judges or doctors. But when a decision-maker +obtains algorithmic assistance, they may not only react to the information. The +decision-maker may view the input of the algorithm as recommending a default +action, making it costly for them to deviate, such as when a judge is reluctant +to overrule a high-risk assessment of a defendant or a doctor fears the +consequences of deviating from recommended procedures. In this article, we +propose a principal-agent model of joint human-machine decision-making. Within +this model, we consider the effect and design of algorithmic recommendations +when they affect choices not just by shifting beliefs, but also by altering +preferences. We motivate this assumption from institutional factors, such as a +desire to avoid audits, as well as from well-established models in behavioral +science that predict loss aversion relative to a reference point, which here is +set by the algorithm. We show that recommendation-dependent preferences create +inefficiencies where the decision-maker is overly responsive to the +recommendation. As a potential remedy, we discuss algorithms that strategically +withhold recommendations, and show how they can improve the quality of final +decisions. + +
+
+
+
+
+ + ♻ ☆ LibSignal: An Open Library for Traffic Signal Control NeurIPS 2022 + + +
+ This paper introduces a library for cross-simulator comparison of +reinforcement learning models in traffic signal control tasks. This library is +developed to implement recent state-of-the-art reinforcement learning models +with extensible interfaces and unified cross-simulator evaluation metrics. It +supports commonly-used simulators in traffic signal control tasks, including +Simulation of Urban MObility(SUMO) and CityFlow, and multiple benchmark +datasets for fair comparisons. We conducted experiments to validate our +implementation of the models and to calibrate the simulators so that the +experiments from one simulator could be referential to the other. Based on the +validated models and calibrated environments, this paper compares and reports +the performance of current state-of-the-art RL algorithms across different +datasets and simulators. This is the first time that these methods have been +compared fairly under the same datasets with different simulators. + +
+
+ comment: 11 pages + 6 pages appendix. Accepted by Machine Learning Journal + (2023). A short version is accepted by NeurIPS 2022 Workshop: Reinforcement + Learning for Real Life. Website: https://darl-libsignal.github.io/ +
+
+
+
+
+ + ♻ ☆ Uncertainty-aware Traffic Prediction under Missing Data ICDM 2023 + + +
+ Traffic prediction is a crucial topic because of its broad scope of applications in the transportation domain. Recently, various studies have achieved promising results. However, most studies assume that the prediction locations have complete, or at least partial, historical records, and therefore they cannot be extended to locations without historical records. In real-life scenarios, sensor deployment can be limited due to budget constraints and installation availability, which makes most current models inapplicable. Although a few studies have tried to impute traffic states at missing locations, these methods need data observed simultaneously at the locations with sensors, making them unsuitable for prediction tasks. Another drawback is the lack of uncertainty measurement in prediction, making prior works unsuitable for risk-sensitive tasks or those involving decision-making. To fill this gap, inspired by previous inductive graph neural networks, this work proposes an uncertainty-aware framework that can 1) extend prediction to missing locations with no historical records, significantly extending the spatial coverage of prediction locations while reducing sensor deployment, and 2) generate probabilistic predictions with uncertainty quantification to help manage risk and decision-making in downstream tasks. Through extensive experiments on real-life datasets, we show that our method achieves promising results on prediction tasks, and that the uncertainty quantification gives consistent results that correlate strongly with whether locations have historical data. We also show that our model can support sensor deployment tasks in the transportation field, achieving higher accuracy with a limited sensor deployment budget.
+
+ comment: 11 pages, 3 figures, a short version of this paper is accepted by + ICDM 2023 +
+
+
+
+
+ + ♻ ☆ D-CIPHER: Discovery of Closed-form Partial Differential Equations NeurIPS 2023 + + +
+ Closed-form differential equations, including partial differential equations +and higher-order ordinary differential equations, are one of the most important +tools used by scientists to model and better understand natural phenomena. +Discovering these equations directly from data is challenging because it +requires modeling relationships between various derivatives that are not +observed in the data (equation-data mismatch) and it involves searching across +a huge space of possible equations. Current approaches make strong assumptions +about the form of the equation and thus fail to discover many well-known +systems. Moreover, many of them resolve the equation-data mismatch by +estimating the derivatives, which makes them inadequate for noisy and +infrequently sampled systems. To this end, we propose D-CIPHER, which is robust +to measurement artifacts and can uncover a new and very general class of +differential equations. We further design a novel optimization procedure, +CoLLie, to help D-CIPHER search through this class efficiently. Finally, we +demonstrate empirically that it can discover many well-known equations that are +beyond the capabilities of current methods. + +
+
+ comment: To appear in the Proceedings of the 37th Conference on Neural + Information Processing Systems (NeurIPS 2023) +
+
+
+
+
+ + ♻ ☆ Universalizing Weak Supervision ICLR 2022 + + +
+ Weak supervision (WS) frameworks are a popular way to bypass hand-labeling large datasets for training data-hungry models. These approaches synthesize multiple noisy but cheaply-acquired estimates of labels into a set of high-quality pseudolabels for downstream training. However, the synthesis technique is specific to a particular kind of label, such as binary labels or sequences, and each new label type requires manually designing a new synthesis algorithm. Instead, we propose a universal technique that enables weak supervision over any label type while still offering desirable properties, including practical flexibility, computational efficiency, and theoretical guarantees. We apply this technique to important problems previously not tackled by WS frameworks, including learning to rank, regression, and learning in hyperbolic space. Theoretically, our synthesis approach produces consistent estimators for learning some challenging but important generalizations of the exponential family model. Experimentally, we validate our framework and show improvement over baselines in diverse settings, including real-world learning-to-rank and regression problems along with learning on hyperbolic manifolds.
+
+ comment: ICLR 2022 +
+
+
+
+
+ + ♻ ☆ Mitigating Source Bias for Fairer Weak Supervision NeurIPS 2023 + + +
+ Weak supervision enables efficient development of training sets by reducing +the need for ground truth labels. However, the techniques that make weak +supervision attractive -- such as integrating any source of signal to estimate +unknown labels -- also entail the danger that the produced pseudolabels are +highly biased. Surprisingly, given everyday use and the potential for increased +bias, weak supervision has not been studied from the point of view of fairness. +We begin such a study, starting with the observation that even when a fair +model can be built from a dataset with access to ground-truth labels, the +corresponding dataset labeled via weak supervision can be arbitrarily unfair. +To address this, we propose and empirically validate a model for source +unfairness in weak supervision, then introduce a simple counterfactual +fairness-based technique that can mitigate these biases. Theoretically, we show +that it is possible for our approach to simultaneously improve both accuracy +and fairness -- in contrast to standard fairness approaches that suffer from +tradeoffs. Empirically, we show that our technique improves accuracy on weak +supervision baselines by as much as 32\% while reducing demographic parity gap +by 82.5\%. A simple extension of our method aimed at maximizing performance +produces state-of-the-art performance in five out of ten datasets in the WRENCH +benchmark. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Guarantees for Self-Play in Multiplayer Games via Polymatrix + Decomposability NeurIPS 2023 + + +
+ Self-play is a technique for machine learning in multi-agent systems where a learning algorithm learns by interacting with copies of itself. Self-play is useful for generating large quantities of data for learning, but has the drawback that the agents the learner will face post-training may behave dramatically differently than the learner came to expect from interacting with itself. For the special case of two-player constant-sum games, self-play that reaches a Nash equilibrium is guaranteed to produce strategies that perform well against any post-training opponent; however, no such guarantee exists for multiplayer games. We show that in games that approximately decompose into a set of two-player constant-sum games (called constant-sum polymatrix games) where global $\epsilon$-Nash equilibria are boundedly far from Nash equilibria in each subgame (called subgame stability), any no-external-regret algorithm that learns by self-play will produce a strategy with bounded vulnerability. For the first time, our results identify a structural property of multiplayer games that enables performance guarantees for the strategies produced by a broad class of self-play algorithms. We demonstrate our findings through experiments on Leduc poker.
+
+ comment: To appear at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ BertRLFuzzer: A BERT and Reinforcement Learning based Fuzzer + + +
+ We present a novel tool BertRLFuzzer, a BERT and Reinforcement Learning (RL) +based fuzzer aimed at finding security vulnerabilities for Web applications. +BertRLFuzzer works as follows: given a set of seed inputs, the fuzzer performs +grammar-adhering and attack-provoking mutation operations on them to generate +candidate attack vectors. The key insight of BertRLFuzzer is the use of RL with +a BERT model as an agent to guide the fuzzer to efficiently learn +grammar-adhering and attack-provoking mutation operators. In order to establish +the efficacy of BertRLFuzzer we compare it against a total of 13 black box and +white box fuzzers over a benchmark of 9 victim websites with over 16K LOC. We +observed a significant improvement relative to the nearest competing tool in +terms of time to first attack (54% less), new vulnerabilities found (17 new +vulnerabilities), and attack rate (4.4% more attack vectors generated). + +
+
+
+
+
+ + ♻ ☆ Chameleon: a heterogeneous and disaggregated accelerator system for + retrieval-augmented language models + + +
+ A Retrieval-Augmented Language Model (RALM) augments a generative language
+model by retrieving context-specific knowledge from an external database. This
+strategy facilitates impressive text generation quality even with smaller
+models, thus reducing computational demands by orders of magnitude. However,
+RALMs introduce unique system design challenges due to (a) the diverse workload
+characteristics between LM inference and retrieval and (b) the various system
+requirements and bottlenecks for different RALM configurations such as model
+sizes, database sizes, and retrieval frequencies. We propose Chameleon, a
+heterogeneous accelerator system that integrates both LM and retrieval
+accelerators in a disaggregated architecture. The heterogeneity ensures
+efficient acceleration of both LM inference and retrieval, while the
+accelerator disaggregation enables the system to independently scale both types
+of accelerators to fulfill diverse RALM requirements. Our Chameleon prototype
+implements retrieval accelerators on FPGAs and assigns LM inference to GPUs,
+with a CPU server orchestrating these accelerators over the network. Compared
+to CPU-based and CPU-GPU vector search systems, Chameleon achieves up to 23.72x
+speedup and 26.2x energy efficiency. Evaluated on various RALMs, Chameleon
+exhibits up to 2.16x reduction in latency and 3.18x speedup in throughput
+compared to the hybrid CPU-GPU architecture. These promising results pave the
+way for bringing accelerator heterogeneity and disaggregation into future RALM
+systems.
+
+
+
+
+
+
+ + ♻ ☆ Corruption-Robust Lipschitz Contextual Search + + +
+ I study the problem of learning a Lipschitz function with corrupted binary
+signals. The learner tries to learn an $L$-Lipschitz function $f: [0,1]^d
+\rightarrow [0, L]$ that the adversary chooses. There is a total of $T$ rounds.
+In each round $t$, the adversary selects a context vector $x_t$ in the input
+space, and the learner makes a guess at the true function value $f(x_t)$ and
+receives a binary signal indicating whether the guess is high or low. In a
+total of $C$ rounds, the signal may be corrupted, though the value of $C$ is
+\emph{unknown} to the learner. The learner's goal is to incur a small
+cumulative loss. This work introduces the new algorithmic technique
+\emph{agnostic checking} as well as new analysis techniques. I design
+algorithms with the following guarantees: for the symmetric loss, the learner
+achieves regret $L\cdot O(C\log T)$ with $d = 1$ and $L\cdot O_d(C\log T +
+T^{(d-1)/d})$ with $d > 1$; for the pricing loss, the learner achieves regret
+$L\cdot \widetilde{O} (T^{d/(d+1)} + C\cdot T^{1/(d+1)})$.
+
+
+
+
+
+
+ + ♻ ☆ Fast and Expressive Gesture Recognition using a Combination-Homomorphic + Electromyogram Encoder + + +
+ We study the task of gesture recognition from electromyography (EMG), with +the goal of enabling expressive human-computer interaction at high accuracy, +while minimizing the time required for new subjects to provide calibration +data. To fulfill these goals, we define combination gestures consisting of a +direction component and a modifier component. New subjects only demonstrate the +single component gestures and we seek to extrapolate from these to all possible +single or combination gestures. We extrapolate to unseen combination gestures +by combining the feature vectors of real single gestures to produce synthetic +training data. This strategy allows us to provide a large and flexible gesture +vocabulary, while not requiring new subjects to demonstrate combinatorially +many example gestures. We pre-train an encoder and a combination operator using +self-supervision, so that we can produce useful synthetic training data for +unseen test subjects. To evaluate the proposed method, we collect a real-world +EMG dataset, and measure the effect of augmented supervision against two +baselines: a partially-supervised model trained with only single gesture data +from the unseen subject, and a fully-supervised model trained with real single +and real combination gesture data from the unseen subject. We find that the +proposed method provides a dramatic improvement over the partially-supervised +model, and achieves a useful classification accuracy that in some cases +approaches the performance of the fully-supervised model. + +
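To make the synthetic-combination idea above concrete, here is a minimal sketch (not the authors' implementation): feature vectors for the two single-gesture components are merged by a simple element-wise average, a stand-in for the combination operator that the paper pre-trains with self-supervision, and a standard classifier is then trained on real single-gesture and synthetic combination-gesture data. All data, dimensions, gesture names, and the averaging operator are illustrative assumptions.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
n_per_class, dim = 40, 16
directions, modifiers = ["up", "down"], ["pinch", "fist"]
singles = directions + modifiers
means = {g: rng.normal(size=dim) for g in singles}   # toy per-gesture EMG feature means

# "Real" calibration data: the new subject demonstrates single-component gestures only.
X_single = np.vstack([means[g] + 0.2 * rng.normal(size=(n_per_class, dim)) for g in singles])
y_single = [g for g in singles for _ in range(n_per_class)]

def combine(a, b):
    # Placeholder combination operator; the paper learns this with self-supervision.
    return 0.5 * (a + b)

# Synthesize training data for combination gestures the subject never demonstrated.
X_combo, y_combo = [], []
for d in directions:
    for m in modifiers:
        fd = X_single[np.array(y_single) == d]
        fm = X_single[np.array(y_single) == m]
        X_combo.append(combine(fd, fm))
        y_combo += [f"{d}+{m}"] * n_per_class

clf = LogisticRegression(max_iter=2000).fit(
    np.vstack([X_single] + X_combo), np.array(y_single + y_combo))
print(clf.classes_)   # classifier now covers single and combination gestures
```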
+
+ comment: 24 pages, 7 figures, 6 tables V2: add link to code, fix bibliography +
+
+
+
+
+ + ♻ ☆ CoLA: Exploiting Compositional Structure for Automatic and Efficient + Numerical Linear Algebra NeurIPS 2023 + + +
+ Many areas of machine learning and science involve large linear algebra +problems, such as eigendecompositions, solving linear systems, computing matrix +exponentials, and trace estimation. The matrices involved often have Kronecker, +convolutional, block diagonal, sum, or product structure. In this paper, we +propose a simple but general framework for large-scale linear algebra problems +in machine learning, named CoLA (Compositional Linear Algebra). By combining a +linear operator abstraction with compositional dispatch rules, CoLA +automatically constructs memory and runtime efficient numerical algorithms. +Moreover, CoLA provides memory efficient automatic differentiation, low +precision computation, and GPU acceleration in both JAX and PyTorch, while also +accommodating new objects, operations, and rules in downstream packages via +multiple dispatch. CoLA can accelerate many algebraic operations, while making +it easy to prototype matrix structures and algorithms, providing an appealing +drop-in tool for virtually any computational effort that requires linear +algebra. We showcase its efficacy across a broad range of applications, +including partial differential equations, Gaussian processes, equivariant model +construction, and unsupervised learning. + +
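The compositional-dispatch idea can be illustrated with a tiny sketch. Note this is not the CoLA API (see the linked repository for that), only a toy showing how structure-aware rules compose: a Kronecker solve dispatches to the solve rules of its factors, so a diagonal factor is handled in O(n) while a dense factor falls back to a generic solver, and the full Kronecker matrix is never materialized.

```python
import numpy as np

class Dense:
    def __init__(self, A):
        self.A = np.asarray(A)
        self.shape = self.A.shape
    def solve(self, b):
        return np.linalg.solve(self.A, b)        # generic O(n^3) fallback rule

class Diagonal:
    def __init__(self, d):
        self.d = np.asarray(d)
        self.shape = (len(d), len(d))
    def solve(self, b):
        return b / self.d                        # structure-aware O(n) rule

class Kronecker:
    """Solve (A kron B) y = x by dispatching to the factors' own solve rules."""
    def __init__(self, A, B):
        self.A, self.B = A, B
    def solve(self, x):
        n_a, n_b = self.A.shape[0], self.B.shape[0]
        X = x.reshape(n_a, n_b)                  # kron(A, B) @ Y.ravel() == (A @ Y @ B.T).ravel()
        W = np.stack([self.A.solve(c) for c in X.T], axis=1)   # A^{-1} X
        Y = np.stack([self.B.solve(r) for r in W], axis=0)     # (A^{-1} X) B^{-T}
        return Y.ravel()

rng = np.random.default_rng(0)
A = Diagonal(rng.uniform(1, 2, size=4))
B = Dense(rng.normal(size=(3, 3)) + 3 * np.eye(3))
x = rng.normal(size=12)
y = Kronecker(A, B).solve(x)                     # never forms the 12 x 12 matrix
print(np.allclose(np.kron(np.diag(A.d), B.A) @ y, x))   # True
```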
+
+ comment: Code available at https://github.com/wilson-labs/cola. NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Equivariant Parameter Sharing for Porous Crystalline Materials + + +
+ Efficiently predicting properties of porous crystalline materials has great
+potential to accelerate the high throughput screening process for developing
+new materials, as simulations carried out using first-principles models are
+often computationally expensive. To effectively make use of Deep Learning
+methods to model these materials, we need to utilize the symmetries present in
+the crystals, which are defined by their space group. Existing methods for
+crystal property prediction either have symmetry constraints that are too
+restrictive or only incorporate symmetries between unit cells. In addition,
+these models do not explicitly model the porous structure of the crystal. In
+this paper, we develop a model which incorporates the symmetries of the unit
+cell of a crystal in its architecture and explicitly models the porous
+structure. We evaluate our model by predicting the heat of adsorption of CO$_2$
+for different configurations of the mordenite zeolite. Our results confirm that
+our method performs better than existing methods for crystal property
+prediction and that the inclusion of pores results in a more efficient model.
+
+
+
+ comment: Additional results +
+
+
+
+
+ + ♻ ☆ Physics-informed neural networks for transformed geometries and + manifolds + + +
+ Physics-informed neural networks (PINNs) effectively embed physical
+principles into machine learning, but often struggle with complex or
+alternating geometries. We propose a novel method for integrating geometric
+transformations within PINNs to robustly accommodate geometric variations. Our
+method incorporates a diffeomorphism as a mapping of a reference domain and
+adapts the derivative computation of the physics-informed loss function. This
+generalizes the applicability of PINNs not only to smoothly deformed domains,
+but also to lower-dimensional manifolds and allows for direct shape
+optimization while training the network. We demonstrate the effectiveness of
+our approach on several problems: (i) Eikonal equation on Archimedean spiral,
+(ii) Poisson problem on surface manifold, (iii) Incompressible Stokes flow in
+deformed tube, and (iv) Shape optimization with Laplace operator. Through these
+examples, we demonstrate the enhanced flexibility over traditional PINNs,
+especially under geometric variations. The proposed framework presents an
+outlook for training deep neural operators over parametrized geometries, paving
+the way for advanced modeling with PDEs on complex geometries in science and
+engineering.
+
+
+
+
+
+
+ + ♻ ☆ SelfOcc: Self-Supervised Vision-Based 3D Occupancy Prediction + + +
+ 3D occupancy prediction is an important task for the robustness of
+vision-centric autonomous driving, which aims to predict whether each point is
+occupied in the surrounding 3D space. Existing methods usually require 3D
+occupancy labels to produce meaningful results. However, it is very laborious
+to annotate the occupancy status of each voxel. In this paper, we propose
+SelfOcc to explore a self-supervised way to learn 3D occupancy using only video
+sequences. We first transform the images into the 3D space (e.g., bird's eye
+view) to obtain a 3D representation of the scene. We directly impose
+constraints on the 3D representations by treating them as signed distance
+fields. We can then render 2D images of previous and future frames as
+self-supervision signals to learn the 3D representations. We propose an
+MVS-embedded strategy to directly optimize the SDF-induced weights with
+multiple depth proposals. Our SelfOcc outperforms the previous best method
+SceneRF by 58.7% using a single frame as input on SemanticKITTI and is the
+first self-supervised work that produces reasonable 3D occupancy for surround
+cameras on nuScenes. SelfOcc produces high-quality depth and achieves
+state-of-the-art results on novel depth synthesis, monocular depth estimation,
+and surround-view depth estimation on SemanticKITTI, KITTI-2015, and nuScenes,
+respectively. Code: https://github.com/huang-yh/SelfOcc.
+
+
+
+ comment: Code is available at: https://github.com/huang-yh/SelfOcc +
+
+
+
+
+ + ♻ ☆ An Attribution Method for Siamese Encoders EMNLP'23 + + +
+ Despite the success of Siamese encoder models such as sentence transformers
+(ST), little is known about the aspects of inputs they pay attention to. A
+barrier is that their predictions cannot be attributed to individual features,
+as they compare two inputs rather than processing a single one. This paper
+derives a local attribution method for Siamese encoders by generalizing the
+principle of integrated gradients to models with multiple inputs. The solution
+takes the form of feature-pair attributions, and can be reduced to a
+token-token matrix for STs. Our method involves the introduction of integrated
+Jacobians and inherits the advantageous formal properties of integrated
+gradients: it accounts for the model's full computation graph and is guaranteed
+to converge to the actual prediction. A pilot study shows that in an ST a few
+token pairs can often explain large fractions of predictions, and that the
+model focuses on nouns and verbs. For accurate predictions, however, it needs
+to attend to the majority of tokens and parts of speech.
+
+
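The feature-pair idea is easiest to see on a toy model where the integrated Jacobian has a closed form. For a bilinear similarity s(a, b) = a^T W b, the pair-attribution matrix is exactly (a - a0)_i W_ij (b - b0)_j, and the completeness property can be checked directly; real sentence transformers require integrating Jacobians numerically along an input path, which this sketch does not attempt.

```python
import numpy as np

rng = np.random.default_rng(0)
d_a, d_b = 5, 4
W = rng.normal(size=(d_a, d_b))            # toy "Siamese" similarity: s(a, b) = a^T W b
a, b = rng.normal(size=d_a), rng.normal(size=d_b)
a0, b0 = np.zeros(d_a), np.zeros(d_b)      # reference (baseline) inputs

def s(x, y):
    return x @ W @ y

# For a bilinear model the integrated Jacobian is W itself, so the
# feature-pair attribution matrix has a closed form: one score per (i, j) pair.
pair_attr = np.outer(a - a0, b - b0) * W

# Completeness: attributions sum exactly to the interaction part of the prediction.
lhs = pair_attr.sum()
rhs = s(a, b) - s(a0, b) - s(a, b0) + s(a0, b0)
print(np.isclose(lhs, rhs))                # True
```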
+
+ comment: Accepted to EMNLP'23 +
+
+
+
+
+ + ♻ ☆ Navigating Cultural Chasms: Exploring and Unlocking the Cultural POV of + Text-To-Image Models + + +
+ Text-To-Image (TTI) models, such as DALL-E and StableDiffusion, have +demonstrated remarkable prompt-based image generation capabilities. +Multilingual encoders may have a substantial impact on the cultural agency of +these models, as language is a conduit of culture. In this study, we explore +the cultural perception embedded in TTI models by characterizing culture across +three hierarchical tiers: cultural dimensions, cultural domains, and cultural +concepts. Based on this ontology, we derive prompt templates to unlock the +cultural knowledge in TTI models, and propose a comprehensive suite of +evaluation techniques, including intrinsic evaluations using the CLIP space, +extrinsic evaluations with a Visual-Question-Answer (VQA) model and human +assessments, to evaluate the cultural content of TTI-generated images. To +bolster our research, we introduce the CulText2I dataset, derived from four +diverse TTI models and spanning ten languages. Our experiments provide insights +regarding Do, What, Which and How research questions about the nature of +cultural encoding in TTI models, paving the way for cross-cultural applications +of these models. + +
+
+
+
+
+ + ♻ ☆ Rigorous dynamical mean field theory for stochastic gradient descent + methods + + +
+ We prove closed-form equations for the exact high-dimensional asymptotics of +a family of first order gradient-based methods, learning an estimator (e.g. +M-estimator, shallow neural network, ...) from observations on Gaussian data +with empirical risk minimization. This includes widely used algorithms such as +stochastic gradient descent (SGD) or Nesterov acceleration. The obtained +equations match those resulting from the discretization of dynamical mean-field +theory (DMFT) equations from statistical physics when applied to gradient flow. +Our proof method allows us to give an explicit description of how memory +kernels build up in the effective dynamics, and to include non-separable update +functions, allowing datasets with non-identity covariance matrices. Finally, we +provide numerical implementations of the equations for SGD with generic +extensive batch-size and with constant learning rates. + +
+
+ comment: 40 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Learning End-to-End Channel Coding with Diffusion Models SC + + +
+ It is a known problem that deep-learning-based end-to-end (E2E) channel
+coding systems depend on a known and differentiable channel model, because the
+learning process relies on gradient-descent optimization methods. This poses
+the challenge of approximating or generating the channel, or its derivative,
+from samples generated by pilot signaling in real-world scenarios. Currently,
+there are two prevalent methods to solve this problem. One is to generate the
+channel via a generative adversarial network (GAN), and the other is to, in
+essence, approximate the gradient via reinforcement learning methods. Other
+methods include using score-based methods, variational autoencoders, or
+mutual-information-based methods. In this paper, we focus on generative models
+and, in particular, on a new promising method called diffusion models, which
+have shown a higher quality of generation in image-based tasks. We will show
+that diffusion models can be used in wireless E2E scenarios and that they work
+as well as Wasserstein GANs while having a more stable training procedure and a
+better generalization ability in testing.
+
+
+
+ comment: 6 pages, WSA/SCC 2023 +
+
+
+
+
+ + ♻ ☆ Compressing the Backward Pass of Large-Scale Neural Architectures by + Structured Activation Pruning + + +
+ The rise of Deep Neural Networks (DNNs) has led to an increase in model size +and complexity, straining the memory capacity of GPUs. Sparsity in DNNs, +characterized as structural or ephemeral, has gained attention as a solution. +This work focuses on ephemeral sparsity, aiming to reduce memory consumption +during training. It emphasizes the significance of activations, an often +overlooked component, and their role in memory usage. This work employs +structured pruning in Block Sparse Compressed Row (BSR) format in combination +with a magnitude-based criterion to efficiently prune activations. We +furthermore introduce efficient block-sparse operators for GPUs and showcase +their effectiveness, as well as the superior compression offered by block +sparsity. We report the effectiveness of activation pruning by evaluating +training speed, accuracy, and memory usage of large-scale neural architectures +on the example of ResMLP on image classification tasks. As a result, we observe +a memory reduction of up to 32% while maintaining accuracy. Ultimately, our +approach aims to democratize large-scale model training, reduce GPU +requirements, and address ecological concerns. + +
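A minimal sketch of the magnitude-based block pruning step described above, using SciPy's Block Sparse Row format; the block size, keep ratio, and scoring rule are illustrative assumptions, and the custom GPU block-sparse operators from the paper are not shown.

```python
import numpy as np
from scipy.sparse import bsr_matrix

rng = np.random.default_rng(0)
acts = rng.normal(size=(128, 256)).astype(np.float32)    # a toy activation map
block = (16, 16)
keep_ratio = 0.25                                        # keep the 25% largest-magnitude blocks

# Magnitude-based criterion: score each block by its mean absolute activation.
blocks = acts.reshape(acts.shape[0] // block[0], block[0],
                      acts.shape[1] // block[1], block[1])
scores = np.abs(blocks).mean(axis=(1, 3))
mask = scores >= np.quantile(scores, 1.0 - keep_ratio)

# Zero out pruned blocks, then store the survivors in Block Sparse Row (BSR) format.
pruned = (blocks * mask[:, None, :, None]).reshape(acts.shape)
sparse_acts = bsr_matrix(pruned, blocksize=block)

kept = sparse_acts.data.shape[0]
print(f"kept {kept} of {scores.size} blocks "
      f"({kept * block[0] * block[1] / acts.size:.0%} of the activation values stored)")
```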
+
+ comment: 8 pages, 11 figures, submitted to the 6th AccML workshop at HiPEAC + conference 2024 +
+
+
+
+
+ + ♻ ☆ FedAgg: Adaptive Federated Learning with Aggregated Gradients + + +
+ Federated Learning (FL) has become an emerging norm for distributed model
+training, which enables multiple devices to cooperatively train a shared model
+on their own datasets, as scheduled by a central server, while keeping private
+data localized. However, during the training process, the
+non-independent-and-identically-distributed (Non-IID) data generated on
+heterogeneous clients and frequent communication across participants may
+significantly influence the training performance, slow down the convergence
+rate, and increase communication consumption. In this paper, we ameliorate the
+standard stochastic gradient descent approach by introducing the aggregated
+gradients at each local update epoch and propose an adaptive learning rate
+iterative algorithm that further takes the deviation between the local
+parameter and global parameter into account. The aforementioned adaptive
+learning rate design mechanism requires local information of all clients, which
+is challenging as there is no communication during the local update epochs. To
+obtain a decentralized adaptive learning rate for each client, we introduce the
+mean-field approach by utilizing two mean-field terms to estimate the average
+local parameters and gradients respectively without exchanging clients' local
+information with each other over time. Through theoretical analysis, we prove
+that our method can provide the convergence guarantee for model training and
+derive an upper bound for the client drifting term. Extensive numerical results
+show that our proposed framework is superior to the state-of-the-art FL schemes
+in both model accuracy and convergence rate on real-world datasets with IID and
+Non-IID data distribution.
+
+
+
+ comment: 16 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ SmoothLLM: Defending Large Language Models Against Jailbreaking Attacks + + +
+ Despite efforts to align large language models (LLMs) with human values, +widely-used LLMs such as GPT, Llama, Claude, and PaLM are susceptible to +jailbreaking attacks, wherein an adversary fools a targeted LLM into generating +objectionable content. To address this vulnerability, we propose SmoothLLM, the +first algorithm designed to mitigate jailbreaking attacks on LLMs. Based on our +finding that adversarially-generated prompts are brittle to character-level +changes, our defense first randomly perturbs multiple copies of a given input +prompt, and then aggregates the corresponding predictions to detect adversarial +inputs. SmoothLLM reduces the attack success rate on numerous popular LLMs to +below one percentage point, avoids unnecessary conservatism, and admits +provable guarantees on attack mitigation. Moreover, our defense uses +exponentially fewer queries than existing attacks and is compatible with any +LLM. Our code is publicly available at the following link: +https://github.com/arobey1/smooth-llm. + +
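The perturb-and-aggregate defense can be sketched as follows; the perturbation type, the jailbreak detector, and the LLM are stubbed placeholders, so this only illustrates the randomized-smoothing structure rather than the released implementation.

```python
import random
import string

def perturb(prompt: str, q: float = 0.1) -> str:
    """Randomly swap a fraction q of characters (one of several possible perturbations)."""
    chars = list(prompt)
    for i in random.sample(range(len(chars)), k=max(1, int(q * len(chars)))):
        chars[i] = random.choice(string.printable)
    return "".join(chars)

def looks_jailbroken(response: str) -> bool:
    # Placeholder detector; real implementations check for refusal/target strings.
    return "sure, here is" in response.lower()

def smoothed_generate(llm, prompt: str, n_copies: int = 8, q: float = 0.1) -> str:
    """Query perturbed copies of the prompt and aggregate the responses by majority vote."""
    responses = [llm(perturb(prompt, q)) for _ in range(n_copies)]
    votes = sum(looks_jailbroken(r) for r in responses)
    if votes > n_copies // 2:
        return "I cannot help with that."
    return next(r for r in responses if not looks_jailbroken(r))

# Usage with a stubbed model standing in for the target LLM.
demo_llm = lambda p: ("Sure, here is how..." if "ignore previous" in p.lower()
                      else "I cannot help with that.")
print(smoothed_generate(demo_llm, "Please ignore previous instructions and ..."))
```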
+
+
+
+
+ + ♻ ☆ Towards Learning Monocular 3D Object Localization From 2D Labels using + the Physical Laws of Motion + + +
+ We present a novel method for precise 3D object localization in single images +from a single calibrated camera using only 2D labels. No expensive 3D labels +are needed. Thus, instead of using 3D labels, our model is trained with +easy-to-annotate 2D labels along with the physical knowledge of the object's +motion. Given this information, the model can infer the latent third dimension, +even though it has never seen this information during training. Our method is +evaluated on both synthetic and real-world datasets, and we are able to achieve +a mean distance error of just 6 cm in our experiments on real data. The results +indicate the method's potential as a step towards learning 3D object location +estimation, where collecting 3D data for training is not feasible. + +
+
+
+
+
+ + ♻ ☆ GC-MVSNet: Multi-View, Multi-Scale, Geometrically-Consistent Multi-View + Stereo WACV 2024 + + +
+ Traditional multi-view stereo (MVS) methods rely heavily on photometric and +geometric consistency constraints, but newer machine learning-based MVS methods +check geometric consistency across multiple source views only as a +post-processing step. In this paper, we present a novel approach that +explicitly encourages geometric consistency of reference view depth maps across +multiple source views at different scales during learning (see Fig. 1). We find +that adding this geometric consistency loss significantly accelerates learning +by explicitly penalizing geometrically inconsistent pixels, reducing the +training iteration requirements to nearly half that of other MVS methods. Our +extensive experiments show that our approach achieves a new state-of-the-art on +the DTU and BlendedMVS datasets, and competitive results on the Tanks and +Temples benchmark. To the best of our knowledge, GC-MVSNet is the first attempt +to enforce multi-view, multi-scale geometric consistency during learning. + +
+
+ comment: Accepted in WACV 2024 +
+
+
+
+
+ + ♻ ☆ FASER: Binary Code Similarity Search through the use of Intermediate + Representations + + +
+ Being able to identify functions of interest in cross-architecture software +is useful whether you are analysing for malware, securing the software supply +chain or conducting vulnerability research. Cross-Architecture Binary Code +Similarity Search has been explored in numerous studies and has used a wide +range of different data sources to achieve its goals. The data sources +typically used draw on common structures derived from binaries such as function +control flow graphs or binary level call graphs, the output of the disassembly +process or the outputs of a dynamic analysis approach. One data source which +has received less attention is binary intermediate representations. Binary +Intermediate representations possess two interesting properties: they are cross +architecture by their very nature and encode the semantics of a function +explicitly to support downstream usage. Within this paper we propose Function +as a String Encoded Representation (FASER) which combines long document +transformers with the use of intermediate representations to create a model +capable of cross architecture function search without the need for manual +feature engineering, pre-training or a dynamic analysis step. We compare our +approach against a series of baseline approaches for two tasks; A general +function search task and a targeted vulnerability search task. Our approach +demonstrates strong performance across both tasks, performing better than all +baseline approaches. + +
+
+ comment: 10 pages, Proceedings of the Conference on Applied Machine Learning + in Information Security (CAMLIS) +
+
+
+
+
+ + ♻ ☆ A sparse coding approach to inverse problems with application to + microwave tomography + + +
+ Inverse imaging problems that are ill-posed can be encountered across
+multiple domains of science and technology, ranging from medical diagnosis to
+astronomical studies. To reconstruct images from incomplete and distorted data,
+it is necessary to create algorithms that can take into account both the
+physical mechanisms responsible for generating these measurements and the
+intrinsic characteristics of the images being analyzed. In this work, the
+sparse representation of images is reviewed, which is a realistic, compact and
+effective generative model for natural images inspired by the visual system of
+mammals. It enables us to address ill-posed linear inverse problems by training
+the model on a vast collection of images. Moreover, we extend the application
+of sparse coding to solve the non-linear and ill-posed problem in microwave
+tomography imaging, which could lead to a significant improvement of the
+state-of-the-art algorithms.
+
+
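The linear building block of sparse-coding reconstruction can be sketched with ISTA; the dictionary and forward operator below are random placeholders (the microwave-tomography problem in the paper is non-linear and uses a dictionary trained on images, neither of which is modeled here).

```python
import numpy as np

rng = np.random.default_rng(0)
n_meas, n_pix, n_atoms = 60, 100, 200

A = rng.normal(size=(n_meas, n_pix)) / np.sqrt(n_meas)       # forward (measurement) operator
D = rng.normal(size=(n_pix, n_atoms))
D /= np.linalg.norm(D, axis=0)                                # sparsifying dictionary

# Ground-truth image: a sparse combination of dictionary atoms.
z_true = np.zeros(n_atoms)
z_true[rng.choice(n_atoms, 8, replace=False)] = rng.normal(size=8)
y = A @ (D @ z_true) + 0.01 * rng.normal(size=n_meas)         # noisy, incomplete measurements

def ista(y, A, D, lam=0.02, n_iter=500):
    """Iterative shrinkage-thresholding for min_z 0.5 * ||A D z - y||^2 + lam * ||z||_1."""
    M = A @ D
    step = 1.0 / np.linalg.norm(M, 2) ** 2                    # 1 / Lipschitz constant of the gradient
    z = np.zeros(M.shape[1])
    for _ in range(n_iter):
        v = z - step * (M.T @ (M @ z - y))                    # gradient step on the data term
        z = np.sign(v) * np.maximum(np.abs(v) - step * lam, 0.0)   # soft threshold (sparsity)
    return z

z_hat = ista(y, A, D)
print("relative reconstruction error:",
      np.linalg.norm(D @ (z_hat - z_true)) / np.linalg.norm(D @ z_true))
```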
+
+ comment: submitted to RevMexAA (conference series) +
+
+
+
+
+ + ♻ ☆ Intellectual Property Protection of Diffusion Models via the Watermark + Diffusion Process + + +
+ Diffusion models have rapidly become a vital part of deep generative +architectures, given today's increasing demands. Obtaining large, +high-performance diffusion models demands significant resources, highlighting +their importance as intellectual property worth protecting. However, existing +watermarking techniques for ownership verification are insufficient when +applied to diffusion models. Very recent research in watermarking diffusion +models either exposes watermarks during task generation, which harms the +imperceptibility, or is developed for conditional diffusion models that require +prompts to trigger the watermark. This paper introduces WDM, a novel +watermarking solution for diffusion models without imprinting the watermark +during task generation. It involves training a model to concurrently learn a +Watermark Diffusion Process (WDP) for embedding watermarks alongside the +standard diffusion process for task generation. We provide a detailed +theoretical analysis of WDP training and sampling, relating it to a shifted +Gaussian diffusion process via the same reverse noise. Extensive experiments +are conducted to validate the effectiveness and robustness of our approach in +various trigger and watermark data configurations. + +
+
+
+
+
+ + ♻ ☆ Human Choice Prediction in Language-based Non-Cooperative Games: + Simulation-based Off-Policy Evaluation + + +
+ Persuasion games have been fundamental in economics and AI research, and have +significant practical applications. Recent works in this area have started to +incorporate natural language, moving beyond the traditional stylized message +setting. However, previous research has focused on on-policy prediction, where +the train and test data have the same distribution, which is not representative +of real-life scenarios. In this paper, we tackle the challenging problem of +off-policy evaluation (OPE) in language-based persuasion games. To address the +inherent difficulty of human data collection in this setup, we propose a novel +approach which combines real and simulated human-bot interaction data. Our +simulated data is created by an exogenous model assuming decision makers (DMs) +start with a mixture of random and decision-theoretic based behaviors and +improve over time. We present a deep learning training algorithm that +effectively integrates real interaction and simulated data, substantially +improving over models that train only with interaction data. Our results +demonstrate the potential of real interaction and simulation mixtures as a +cost-effective and scalable solution for OPE in language-based persuasion +games. Our code and the large dataset we collected and generated are submitted +as supplementary material and publicly available in our GitHub repository: +https://github.com/eilamshapira/HumanChoicePrediction + +
+
+
+
+
+ + ♻ ☆ B-LSTM-MIONet: Bayesian LSTM-based Neural Operators for Learning the + Response of Complex Dynamical Systems to Length-Variant Multiple Input + Functions + + +
+ Deep Operator Network (DeepONet) is a neural network framework for learning
+nonlinear operators such as those from ordinary differential equations (ODEs)
+describing complex systems. Multiple-input deep neural operators (MIONet)
+extended DeepONet to allow multiple input functions in different Banach spaces.
+MIONet offers flexibility in training dataset grid spacing, without constraints
+on output location. However, it requires offline inputs and cannot handle
+varying sequence lengths in testing datasets, limiting its real-time
+application in dynamic complex systems. This work redesigns MIONet, integrating
+Long Short Term Memory (LSTM) to learn neural operators from time-dependent
+data. This approach overcomes data discretization constraints and harnesses
+LSTM's capability with variable-length, real-time data. Factors affecting
+learning performance, such as the algorithm's extrapolation ability, are
+presented. The framework is enhanced with uncertainty quantification through a
+novel Bayesian method, sampling from MIONet parameter distributions.
+Consequently, we develop the B-LSTM-MIONet, incorporating LSTM's temporal
+strengths with Bayesian robustness, resulting in a more precise and reliable
+model for noisy datasets.
+
+
+
+
+
+
+ + ♻ ☆ Modular Neural Networks for Time Series Forecasting: Interpretability + and Feature Selection using Attention + + +
+ Multivariate time series have many applications, from healthcare and +meteorology to life science. Although deep learning models have shown excellent +predictive performance for time series, they have been criticised for being +"black-boxes" or non-interpretable. This paper proposes a novel modular neural +network model for multivariate time series prediction that is interpretable by +construction. A recurrent neural network learns the temporal dependencies in +the data while an attention-based feature selection component selects the most +relevant features and suppresses redundant features used in the learning of the +temporal dependencies. A modular deep network is trained from the selected +features independently to show the users how features influence outcomes, +making the model interpretable. Experimental results show that this approach +can outperform state-of-the-art interpretable Neural Additive Models (NAM) and +variations thereof in both regression and classification of time series tasks, +achieving a predictive performance that is comparable to the top +non-interpretable methods for time series, LSTM and XGBoost. + +
+
+
+
+
+ + ♻ ☆ DTW+S: Shape-based Comparison of Time-series with Ordered Local Trend + + +
+ Measuring distance or similarity between time-series data is a fundamental
+aspect of many applications including classification, clustering, and
+ensembling/alignment. Existing measures may fail to capture similarities among
+local trends (shapes) and may even produce misleading results. Our goal is to
+develop a measure that looks for similar trends occurring around similar times
+and is easily interpretable for researchers in applied domains. This is
+particularly useful for applications where time-series have a sequence of
+meaningful local trends that are ordered, such as in epidemics (a surge,
+followed by an increase, a peak, and then a decrease). We propose a novel
+measure, DTW+S, which creates an interpretable "closeness-preserving" matrix
+representation of the time-series, where each column represents local trends,
+and then it applies Dynamic Time Warping to compute distances between these
+matrices. We present a theoretical analysis that supports the choice of this
+representation. We demonstrate the utility of DTW+S in several tasks. For the
+clustering of epidemic curves, we show that DTW+S is the only measure able to
+produce good clustering compared to the baselines. For ensemble building, we
+propose a combination of DTW+S and barycenter averaging that results in the
+best preservation of characteristics of the underlying trajectories. We also
+demonstrate that our approach results in better classification compared to
+Dynamic Time Warping for a class of datasets, particularly when local trends
+rather than scale play a decisive role.
+
+
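A minimal sketch of the two ingredients, a matrix of local-trend descriptors and Dynamic Time Warping over its columns; the single slope feature used here is a placeholder for the richer closeness-preserving shape representation developed in the paper.

```python
import numpy as np

def local_trend_matrix(x, window=5):
    """Each column describes the local trend (slope of a linear fit) around one time step."""
    t = np.arange(window)
    cols = [np.polyfit(t, x[i:i + window], deg=1)[0] for i in range(len(x) - window + 1)]
    return np.array(cols)[None, :]        # shape (n_features, n_windows); 1 feature here

def dtw(A, B):
    """Classic DTW over column sequences, with Euclidean column-to-column cost."""
    n, m = A.shape[1], B.shape[1]
    D = np.full((n + 1, m + 1), np.inf)
    D[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = np.linalg.norm(A[:, i - 1] - B[:, j - 1])
            D[i, j] = cost + min(D[i - 1, j], D[i, j - 1], D[i - 1, j - 1])
    return D[n, m]

t = np.linspace(0, 1, 60)
surge      = np.exp(-(t - 0.4) ** 2 / 0.01)   # rise, peak, decline around t = 0.4
late_surge = np.exp(-(t - 0.6) ** 2 / 0.01)   # same ordered trends, shifted later
flat       = 0.1 * t

print(dtw(local_trend_matrix(surge), local_trend_matrix(late_surge)))  # small: same shape
print(dtw(local_trend_matrix(surge), local_trend_matrix(flat)))        # larger: different shape
```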
+
+ comment: 11 pages, 11 figures Update: Included barycenter averaging with DTW+S + along with results +
+
+
+
+
+ + ♻ ☆ Uncertainty Quantification in Neural-Network Based Pain Intensity + Estimation + + +
+ Improper pain management can lead to severe physical or mental consequences,
+including suffering, and an increased risk of opioid dependency. Assessing the
+presence and severity of pain is imperative to prevent such outcomes and
+determine the appropriate intervention. However, the evaluation of pain
+intensity is challenging because different individuals experience pain
+differently. To overcome this, researchers have employed machine learning
+models to evaluate pain intensity objectively. However, these efforts have
+primarily focused on point estimation of pain, disregarding the inherent
+uncertainty and variability present in the data and model. Consequently, the
+point estimates provide only partial information for clinical decision-making.
+This study presents a neural network-based method for objective pain interval
+estimation, incorporating uncertainty quantification. This work explores three
+algorithms: the bootstrap method, lower and upper bound estimation (LossL)
+optimized by genetic algorithm, and modified lower and upper bound estimation
+(LossS) optimized by gradient descent algorithm. Our empirical results reveal
+that LossS outperforms the other two by providing a narrower prediction
+interval. As LossS performs best, we assessed its performance in three
+different scenarios for pain assessment: (1) a generalized approach (single
+model for the entire population), (2) a personalized approach (separate model
+for each individual), and (3) a hybrid approach (separate model for each
+cluster of individuals). Our findings demonstrate the hybrid approach's
+superior performance, with notable practicality in clinical contexts. It has
+the potential to be a valuable tool for clinicians, enabling objective pain
+intensity assessment while taking uncertainty into account. This capability is
+crucial in facilitating effective pain management and reducing the risks
+associated with improper treatment.
+
+
+
+ comment: 26 pages, 5 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ Effective Learning with Node Perturbation in Deep Neural Networks + + +
+ Backpropagation (BP) is the dominant and most successful method for training +parameters of deep neural network models. However, BP relies on two +computationally distinct phases, does not provide a satisfactory explanation of +biological learning, and can be challenging to apply for training of networks +with discontinuities or noisy node dynamics. By comparison, node perturbation +(NP) proposes learning by the injection of noise into the network activations, +and subsequent measurement of the induced loss change. NP relies on two forward +(inference) passes, does not make use of network derivatives, and has been +proposed as a model for learning in biological systems. However, standard NP is +highly data inefficient and unstable due to its unguided noise-based search +process. In this work, we investigate different formulations of NP and relate +it to the concept of directional derivatives as well as combining it with a +decorrelating mechanism for layer-wise inputs. We find that a closer alignment +with directional derivatives together with input decorrelation at every layer +significantly enhances performance of NP learning, making its performance on +the train set competitive with BP and allowing its application to noisy systems +in which the noise process itself is inaccessible. + +
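A minimal single-layer sketch of the vanilla node-perturbation update (two forward passes, a noise injection, and a weight change proportional to the induced loss change); the paper's directional-derivative formulation and layer-wise input decorrelation are not included, and the task and hyperparameters are illustrative.

```python
import numpy as np

rng = np.random.default_rng(0)
n_in, n_out, n_samples = 20, 5, 512
W_true = rng.normal(size=(n_out, n_in))
X = rng.normal(size=(n_samples, n_in))
T = X @ W_true.T                                   # toy regression targets

def sq_loss(y, t):
    return 0.5 * np.sum((y - t) ** 2)

W = np.zeros((n_out, n_in))
sigma, lr = 0.05, 0.01
for epoch in range(200):
    for x, t in zip(X, T):
        y_clean = W @ x                            # first (clean) forward pass
        xi = rng.normal(size=n_out)
        y_noisy = y_clean + sigma * xi             # second pass with noise injected at the nodes
        delta = (sq_loss(y_noisy, t) - sq_loss(y_clean, t)) / sigma
        W -= lr * delta * np.outer(xi, x)          # update driven by the induced loss change

print("final mean loss:", np.mean([sq_loss(W @ x, t) for x, t in zip(X, T)]))
```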
+
+
+
+
+ + ♻ ☆ Distill Gold from Massive Ores: Efficient Dataset Distillation via + Critical Samples Selection + + +
+ Data-efficient learning has garnered significant attention, especially given
+the current trend of large multi-modal models. Recently, dataset distillation
+has become an effective approach for data-efficiency; however, the distillation
+process itself can still be inefficient. In this work, we model the dataset
+distillation task within the context of information transport. By observing the
+substantial data redundancy inherent in the distillation, we argue to put more
+emphasis on the samples' utility for the distillation task. We introduce and
+validate a family of data utility estimators and optimal data selection methods
+to exploit the most valuable samples. This strategy significantly reduces the
+training costs and extends various existing distillation algorithms to larger
+and more diversified datasets, e.g., in some cases only 0.04% training data is
+sufficient for comparable distillation performance. Our method consistently
+enhances the distillation algorithms, even on much larger-scale and more
+heterogeneous datasets, e.g. ImageNet-1K and Kinetics-400. This paradigm opens
+up new avenues in the dynamics of distillation and paves the way for efficient
+dataset distillation. Our code is available at
+https://github.com/silicx/GoldFromOres .
+
+
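The selection idea can be sketched generically: score samples with a cheap utility proxy and hand only the top fraction to the distillation stage. The per-sample-loss proxy and the 5% budget below are assumptions for illustration, not the utility estimators proposed in the paper.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(5000, 20))
w = rng.normal(size=20)
y = (X @ w + 0.5 * rng.normal(size=5000) > 0).astype(int)

# Utility proxy: per-sample loss under a quickly trained probe model
# (a stand-in for the data utility estimators studied in the paper).
probe = LogisticRegression(max_iter=200).fit(X[:1000], y[:1000])
p = probe.predict_proba(X)[:, 1].clip(1e-6, 1 - 1e-6)
utility = -(y * np.log(p) + (1 - y) * np.log(1 - p))    # treat high-loss samples as informative

keep = np.argsort(utility)[-int(0.05 * len(X)):]        # pass only the top 5% to distillation
print(f"selected {len(keep)} of {len(X)} samples for the distillation stage")
```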
+
+
+
+
+ + ♻ ☆ Rethinking Attention: Exploring Shallow Feed-Forward Neural Networks as + an Alternative to Attention Layers in Transformers AAAI24 + + +
+ This work presents an analysis of the effectiveness of using standard shallow +feed-forward networks to mimic the behavior of the attention mechanism in the +original Transformer model, a state-of-the-art architecture for +sequence-to-sequence tasks. We substitute key elements of the attention +mechanism in the Transformer with simple feed-forward networks, trained using +the original components via knowledge distillation. Our experiments, conducted +on the IWSLT2017 dataset, reveal the capacity of these "attentionless +Transformers" to rival the performance of the original architecture. Through +rigorous ablation studies, and experimenting with various replacement network +types and sizes, we offer insights that support the viability of our approach. +This not only sheds light on the adaptability of shallow feed-forward networks +in emulating attention mechanisms but also underscores their potential to +streamline complex architectures for sequence-to-sequence tasks. + +
+
+ comment: Accepted at AAAI24(https://aaai.org/aaai-conference/) +
+
+
+
+
+ + ♻ ☆ SVDinsTN: A Tensor Network Paradigm for Efficient Structure Search from + Regularized Modeling Perspective + + +
+ Tensor network (TN) representation is a powerful technique for computer +vision and machine learning. TN structure search (TN-SS) aims to search for a +customized structure to achieve a compact representation, which is a +challenging NP-hard problem. Recent "sampling-evaluation-based" methods require +sampling an extensive collection of structures and evaluating them one by one, +resulting in prohibitively high computational costs. To address this issue, we +propose a novel TN paradigm, named SVD-inspired TN decomposition (SVDinsTN), +which allows us to efficiently solve the TN-SS problem from a regularized +modeling perspective, eliminating the repeated structure evaluations. To be +specific, by inserting a diagonal factor for each edge of the fully-connected +TN, SVDinsTN allows us to calculate TN cores and diagonal factors +simultaneously, with the factor sparsity revealing a compact TN structure. In +theory, we prove a convergence guarantee for the proposed method. Experimental +results demonstrate that the proposed method achieves approximately 100 to 1000 +times acceleration compared to the state-of-the-art TN-SS methods while +maintaining a comparable representation ability. + +
+
+
+
+
+ + ♻ ☆ Modern Bayesian Experimental Design + + +
+ Bayesian experimental design (BED) provides a powerful and general framework +for optimizing the design of experiments. However, its deployment often poses +substantial computational challenges that can undermine its practical use. In +this review, we outline how recent advances have transformed our ability to +overcome these challenges and thus utilize BED effectively, before discussing +some key areas for future development in the field. + +
+
+ comment: Accepted for publication in Statistical Science +
+
+
+
+
+ + ♻ ☆ CD-GAN: a robust fusion-based generative adversarial network for + unsupervised remote sensing change detection with heterogeneous sensors + + +
+ In the context of Earth observation, change detection boils down to comparing +images acquired at different times by sensors of possibly different spatial +and/or spectral resolutions or different modalities (e.g., optical or radar). +Even when considering only optical images, this task has proven to be +challenging as soon as the sensors differ by their spatial and/or spectral +resolutions. This paper proposes a novel unsupervised change detection method +dedicated to images acquired by such so-called heterogeneous optical sensors. +It capitalizes on recent advances which formulate the change detection task +into a robust fusion framework. Adopting this formulation, the work reported in +this paper shows that any off-the-shelf network trained beforehand to fuse +optical images of different spatial and/or spectral resolutions can be easily +complemented with a network of the same architecture and embedded into an +adversarial framework to perform change detection. A comparison with +state-of-the-art change detection methods demonstrates the versatility and the +effectiveness of the proposed approach. + +
+
+
+
+
+ + ♻ ☆ A Comprehensive Survey on Distributed Training of Graph Neural Networks + + +
+ Graph neural networks (GNNs) have been demonstrated to be a powerful +algorithmic model in broad application fields for their effectiveness in +learning over graphs. To scale GNN training up for large-scale and ever-growing +graphs, the most promising solution is distributed training which distributes +the workload of training across multiple computing nodes. At present, the +volume of related research on distributed GNN training is exceptionally vast, +accompanied by an extraordinarily rapid pace of publication. Moreover, the +approaches reported in these studies exhibit significant divergence. This +situation poses a considerable challenge for newcomers, hindering their ability +to grasp a comprehensive understanding of the workflows, computational +patterns, communication strategies, and optimization techniques employed in +distributed GNN training. As a result, there is a pressing need for a survey to +provide correct recognition, analysis, and comparisons in this field. In this +paper, we provide a comprehensive survey of distributed GNN training by +investigating various optimization techniques used in distributed GNN training. +First, distributed GNN training is classified into several categories according +to their workflows. In addition, their computational patterns and communication +patterns, as well as the optimization techniques proposed by recent work are +introduced. Second, the software frameworks and hardware platforms of +distributed GNN training are also introduced for a deeper understanding. Third, +distributed GNN training is compared with distributed training of deep neural +networks, emphasizing the uniqueness of distributed GNN training. Finally, +interesting issues and opportunities in this field are discussed. + +
+
+ comment: To Appear in Proceedings of the IEEE +
+
+
+
+
+ + ♻ ☆ Knockoffs-SPR: Clean Sample Selection in Learning with Noisy Labels + + +
+ A noisy training set usually leads to the degradation of the generalization +and robustness of neural networks. In this paper, we propose a novel +theoretically guaranteed clean sample selection framework for learning with +noisy labels. Specifically, we first present a Scalable Penalized Regression +(SPR) method, to model the linear relation between network features and one-hot +labels. In SPR, the clean data are identified by the zero mean-shift parameters +solved in the regression model. We theoretically show that SPR can recover +clean data under some conditions. Under general scenarios, the conditions may +be no longer satisfied; and some noisy data are falsely selected as clean data. +To solve this problem, we propose a data-adaptive method for Scalable Penalized +Regression with Knockoff filters (Knockoffs-SPR), which is provable to control +the False-Selection-Rate (FSR) in the selected clean data. To improve the +efficiency, we further present a split algorithm that divides the whole +training set into small pieces that can be solved in parallel to make the +framework scalable to large datasets. While Knockoffs-SPR can be regarded as a +sample selection module for a standard supervised training pipeline, we further +combine it with a semi-supervised algorithm to exploit the support of noisy +data as unlabeled data. Experimental results on several benchmark datasets and +real-world noisy datasets show the effectiveness of our framework and validate +the theoretical results of Knockoffs-SPR. Our code and pre-trained models are +available at https://github.com/Yikai-Wang/Knockoffs-SPR. + +
+
+ comment: update: final version, to appear in TPAMI +
+
+
+
+
+ + ♻ ☆ Learning to Estimate Without Bias + + +
+ The Gauss-Markov theorem states that the weighted least squares estimator is
+the linear minimum variance unbiased estimator (MVUE) in linear models. In this
+paper, we take a first step towards extending this result to non-linear
+settings via deep learning with bias constraints. The classical approach to
+designing non-linear MVUEs is through maximum likelihood estimation (MLE) which
+often involves computationally challenging optimizations. On the other hand,
+deep learning methods allow for non-linear estimators with fixed computational
+complexity. Learning-based estimators perform optimally on average with respect
+to their training set but may suffer from significant bias in other parameters.
+To avoid this, we propose to add a simple bias constraint to the loss function,
+resulting in an estimator we refer to as Bias Constrained Estimator (BCE). We
+prove that this yields asymptotic MVUEs that behave similarly to the classical
+MLEs and asymptotically attain the Cramér-Rao bound. We demonstrate the
+advantages of our approach in the context of signal to noise ratio estimation
+as well as covariance estimation. A second motivation to BCE is in applications
+where multiple estimates of the same unknown are averaged for improved
+performance. Examples include distributed sensor networks and data augmentation
+at test time. In such applications, we show that BCE leads to asymptotically
+consistent estimators.
+
+
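A minimal sketch of a bias-constrained loss on a toy parameter-estimation task: the usual MSE term is augmented with the squared empirical bias, computed by averaging the error over repeated noisy observations of the same parameter value. The network, penalty weight, and data are illustrative assumptions, not the paper's experimental setup.

```python
import torch
import torch.nn as nn

torch.manual_seed(0)

# Toy task: estimate a scalar parameter theta from n_obs noisy measurements.
n_thetas, n_reps, n_obs = 64, 32, 10
theta = torch.rand(n_thetas, 1) * 4 - 2                                # true parameters in [-2, 2]
x = theta.unsqueeze(1) + 0.5 * torch.randn(n_thetas, n_reps, n_obs)    # repeated noisy draws

net = nn.Sequential(nn.Linear(n_obs, 32), nn.ReLU(), nn.Linear(32, 1))
opt = torch.optim.Adam(net.parameters(), lr=1e-2)
lam = 5.0                                                              # weight of the bias constraint

for step in range(2000):
    pred = net(x).squeeze(-1)                  # shape (n_thetas, n_reps)
    err = pred - theta                         # theta broadcasts over the repetitions
    mse = (err ** 2).mean()
    bias_penalty = (err.mean(dim=1) ** 2).mean()   # squared empirical bias per parameter value
    loss = mse + lam * bias_penalty
    opt.zero_grad(); loss.backward(); opt.step()

with torch.no_grad():
    err = net(x).squeeze(-1) - theta
    print("mean |bias| per parameter value:", err.mean(dim=1).abs().mean().item())
```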
+
+
+
+
+ + ♻ ☆ Unsupervised approaches based on optimal transport and convex analysis + for inverse problems in imaging + + +
+ Unsupervised deep learning approaches have recently become one of the crucial +research areas in imaging owing to their ability to learn expressive and +powerful reconstruction operators even when paired high-quality training data +is scarcely available. In this chapter, we review theoretically principled +unsupervised learning schemes for solving imaging inverse problems, with a +particular focus on methods rooted in optimal transport and convex analysis. We +begin by reviewing the optimal transport-based unsupervised approaches such as +the cycle-consistency-based models and learned adversarial regularization +methods, which have clear probabilistic interpretations. Subsequently, we give +an overview of a recent line of works on provably convergent learned +optimization algorithms applied to accelerate the solution of imaging inverse +problems, alongside their dedicated unsupervised training schemes. We also +survey a number of provably convergent plug-and-play algorithms (based on +gradient-step deep denoisers), which are among the most important and widely +applied unsupervised approaches for imaging problems. At the end of this +survey, we provide an overview of a few related unsupervised learning +frameworks that complement our focused schemes. Together with a detailed +survey, we provide an overview of the key mathematical results that underlie +the methods reviewed in the chapter to keep our discussion self-contained. + +
+
+
+
+
+ + ♻ ☆ Hausdorff Distance Matching with Adaptive Query Denoising for Rotated + Detection Transformer + + +
+ The Detection Transformer (DETR) has emerged as a pivotal architecture for
+object detection, setting new performance benchmarks due to its end-to-end
+design and scalability. Despite its advancements, the application of DETR in
+detecting rotated objects has demonstrated suboptimal performance relative to
+established oriented object detectors. Our analysis identifies a key
+limitation: the L1 cost used in Hungarian Matching leads to duplicate
+predictions due to the square-like problem in oriented object detection,
+thereby obstructing the training process of the detector. We introduce a
+Hausdorff distance-based cost for Hungarian matching, which more accurately
+quantifies the discrepancy between predictions and ground truths. Moreover, we
+note that a static denoising approach hampers the training of rotated DETR,
+particularly when the detector's predictions surpass the quality of noised
+ground truths. We propose an adaptive query denoising technique, employing
+Hungarian matching to selectively filter out superfluous noised queries that no
+longer contribute to model improvement. Our proposed modifications to DETR have
+resulted in superior performance, surpassing previous rotated DETR models and
+other alternatives. This is evidenced by our model's state-of-the-art
+achievements in benchmarks such as DOTA-v1.0/v1.5/v2.0, and DIOR-R.
+
+
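A toy sketch of the matching ingredient only: oriented boxes are represented by their corner point sets, a symmetric Hausdorff distance fills the cost matrix, and the Hungarian algorithm assigns predictions to ground truths. Classification costs, the DETR pipeline, and adaptive query denoising are omitted, and the box parameters are made up for illustration.

```python
import numpy as np
from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import directed_hausdorff

def obb_corners(cx, cy, w, h, angle):
    """Corner points of an oriented bounding box."""
    c, s = np.cos(angle), np.sin(angle)
    local = np.array([[-w, -h], [w, -h], [w, h], [-w, h]]) / 2.0
    return local @ np.array([[c, s], [-s, c]]) + np.array([cx, cy])

def hausdorff(p, q):
    """Symmetric Hausdorff distance between two corner point sets."""
    return max(directed_hausdorff(p, q)[0], directed_hausdorff(q, p)[0])

preds = [obb_corners(10, 10, 4, 2, 0.1), obb_corners(30, 5, 6, 3, 1.2), obb_corners(50, 40, 5, 5, 0.0)]
gts   = [obb_corners(29, 6, 6, 3, 1.1), obb_corners(10, 11, 4, 2, 0.2)]

cost = np.array([[hausdorff(p, g) for g in gts] for p in preds])
rows, cols = linear_sum_assignment(cost)        # Hungarian matching on the Hausdorff cost
for r, c in zip(rows, cols):
    print(f"prediction {r} -> ground truth {c} (cost {cost[r, c]:.2f})")
```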
+
+ comment: Under review, 16 pages, 12 tables, 8 figures +
+
+
+
+
+ + ♻ ☆ DPSUR: Accelerating Differentially Private Stochastic Gradient Descent + Using Selective Update and Release VLDB 2024 + + +
+ Machine learning models are known to memorize private data to reduce their +training loss, which can be inadvertently exploited by privacy attacks such as +model inversion and membership inference. To protect against these attacks, +differential privacy (DP) has become the de facto standard for +privacy-preserving machine learning, particularly those popular training +algorithms using stochastic gradient descent, such as DPSGD. Nonetheless, DPSGD +still suffers from severe utility loss due to its slow convergence. This is +partially caused by the random sampling, which brings bias and variance to the +gradient, and partially by the Gaussian noise, which leads to fluctuation of +gradient updates. + Our key idea to address these issues is to apply selective updates to the +model training, while discarding those useless or even harmful updates. +Motivated by this, this paper proposes DPSUR, a Differentially Private training +framework based on Selective Updates and Release, where the gradient from each +iteration is evaluated based on a validation test, and only those updates +leading to convergence are applied to the model. As such, DPSUR ensures the +training in the right direction and thus can achieve faster convergence than +DPSGD. The main challenges lie in two aspects -- privacy concerns arising from +gradient evaluation, and gradient selection strategy for model update. To +address the challenges, DPSUR introduces a clipping strategy for update +randomization and a threshold mechanism for gradient selection. Experiments +conducted on MNIST, FMNIST, CIFAR-10, and IMDB datasets show that DPSUR +significantly outperforms previous works in terms of convergence speed and +model utility. + +
+
+ comment: This paper has been accepted by VLDB 2024 +
+
+
+
+
+ + ♻ ☆ Discovering Predictable Latent Factors for Time Series Forecasting + + +
+ Modern time series forecasting methods, such as Transformer and its variants, +have shown strong ability in sequential data modeling. To achieve high +performance, they usually rely on redundant or unexplainable structures to +model complex relations between variables and tune the parameters with +large-scale data. Many real-world data mining tasks, however, lack sufficient +variables for relation reasoning, and therefore these methods may not properly +handle such forecasting problems. With insufficient data, time series appear to +be affected by many exogenous variables, and thus, the modeling becomes +unstable and unpredictable. To tackle this critical issue, in this paper, we +develop a novel algorithmic framework for inferring the intrinsic latent +factors implied by the observable time series. The inferred factors are used to +form multiple independent and predictable signal components that enable not +only sparse relation reasoning for long-term efficiency but also reconstructing +the future temporal data for accurate prediction. To achieve this, we introduce +three characteristics, i.e., predictability, sufficiency, and identifiability, +and model these characteristics via the powerful deep latent dynamics models to +infer the predictable signal components. Empirical results on multiple real +datasets show the efficiency of our method for different kinds of time series +forecasting. The statistical analysis validates the predictability of the +learned latent factors. + +
+
+
+
+
+
+ ♻ ☆ A Multivariate Unimodality Test Harnessing the Dip Statistic of
+ Mahalanobis Distances Over Random Projections
+
+
+ Unimodality, pivotal in statistical analysis, offers insights into dataset +structures and drives sophisticated analytical procedures. While unimodality's +confirmation is straightforward for one-dimensional data using methods like +Silverman's approach and Hartigans' dip statistic, its generalization to higher +dimensions remains challenging. By extrapolating one-dimensional unimodality +principles to multi-dimensional spaces through linear random projections and +leveraging point-to-point distancing, our method, rooted in +$\alpha$-unimodality assumptions, presents a novel multivariate unimodality +test named mud-pod. Both theoretical and empirical studies confirm the efficacy +of our method in unimodality assessment of multidimensional datasets as well as +in estimating the number of clusters. + +
+
+ comment: 11 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ Meta-Learning with a Geometry-Adaptive Preconditioner CVPR 2023 + + +
+ Model-agnostic meta-learning (MAML) is one of the most successful +meta-learning algorithms. It has a bi-level optimization structure where the +outer-loop process learns a shared initialization and the inner-loop process +optimizes task-specific weights. Although MAML relies on the standard gradient +descent in the inner-loop, recent studies have shown that controlling the +inner-loop's gradient descent with a meta-learned preconditioner can be +beneficial. Existing preconditioners, however, cannot simultaneously adapt in a +task-specific and path-dependent way. Additionally, they do not satisfy the +Riemannian metric condition, which can enable the steepest descent learning +with preconditioned gradient. In this study, we propose Geometry-Adaptive +Preconditioned gradient descent (GAP) that can overcome the limitations in +MAML; GAP can efficiently meta-learn a preconditioner that is dependent on +task-specific parameters, and its preconditioner can be shown to be a +Riemannian metric. Thanks to the two properties, the geometry-adaptive +preconditioner is effective for improving the inner-loop optimization. +Experiment results show that GAP outperforms the state-of-the-art MAML family +and preconditioned gradient descent-MAML (PGD-MAML) family in a variety of +few-shot learning tasks. Code is available at: +https://github.com/Suhyun777/CVPR23-GAP. + +
+
+ comment: Accepted at CVPR 2023. Code is available at: + https://github.com/Suhyun777/CVPR23-GAP; This is an extended version of our + previous CVPR23 work +
+
+
+
+
+ + ♻ ☆ Meaning Representations from Trajectories in Autoregressive Models + + +
+ We propose to extract meaning representations from autoregressive language +models by considering the distribution of all possible trajectories extending +an input text. This strategy is prompt-free, does not require fine-tuning, and +is applicable to any pre-trained autoregressive model. Moreover, unlike +vector-based representations, distribution-based representations can also model +asymmetric relations (e.g., direction of logical entailment, hypernym/hyponym +relations) by using algebraic operations between likelihood functions. These +ideas are grounded in distributional perspectives on semantics and are +connected to standard constructions in automata theory, but to our knowledge +they have not been applied to modern language models. We empirically show that +the representations obtained from large models align well with human +annotations, outperform other zero-shot and prompt-free methods on semantic +similarity tasks, and can be used to solve more complex entailment and +containment tasks that standard embeddings cannot handle. Finally, we extend +our method to represent data from different modalities (e.g., image and text) +using multimodal autoregressive models. Our code is available at: +https://github.com/tianyu139/meaning-as-trajectories + +
+
+
+
+
+ + ♻ ☆ An Efficient High-Dimensional Gene Selection Approach based on Binary + Horse Herd Optimization Algorithm for Biological Data Classification + + +
+ The Horse Herd Optimization Algorithm (HOA) is a new meta-heuristic algorithm +based on the behaviors of horses at different ages. The HOA was introduced +recently to solve complex and high-dimensional problems. This paper proposes a +binary version of the Horse Herd Optimization Algorithm (BHOA) in order to +solve discrete problems and select prominent feature subsets. Moreover, this +study provides a novel hybrid feature selection framework based on the BHOA and +a minimum Redundancy Maximum Relevance (MRMR) filter method. This hybrid +feature selection, which is more computationally efficient, produces a +beneficial subset of relevant and informative features. Since feature selection +is a binary problem, we have applied a new Transfer Function (TF), called +X-shape TF, which transforms continuous problems into binary search spaces. +Furthermore, the Support Vector Machine (SVM) is utilized to examine the +efficiency of the proposed method on ten microarray datasets, namely Lymphoma, +Prostate, Brain-1, DLBCL, SRBCT, Leukemia, Ovarian, Colon, Lung, and MLL. In +comparison to other state-of-the-art methods, such as the Gray Wolf (GW), +Particle Swarm Optimization (PSO), and Genetic Algorithm (GA), the proposed +hybrid method (MRMR-BHOA) demonstrates superior performance in terms of +accuracy and minimum selected features. Also, experimental results show that +the X-Shaped BHOA approach outperforms other methods. + +
+
+
+
+
+ + ♻ ☆ Business Policy Experiments using Fractional Factorial Designs: Consumer + Retention on DoorDash + + +
+ This paper investigates an approach to both speed up business decision-making +and lower the cost of learning through experimentation by factorizing business +policies and employing fractional factorial experimental designs for their +evaluation. We illustrate how this method integrates with advances in the +estimation of heterogeneous treatment effects, elaborating on its advantages +and foundational assumptions. We empirically demonstrate the implementation and +benefits of our approach and assess its validity in evaluating consumer +promotion policies at DoorDash, which is one of the largest delivery platforms +in the US. Our approach discovers a policy with 5% incremental profit at 67% +lower implementation cost. + +
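+ As a toy illustration of the fractional-factorial idea (not DoorDash's actual design), the sketch below builds a 2^(4-1) half-fraction in which the fourth factor is aliased with the three-way interaction of the first three, halving the number of experimental cells:
+import itertools
+import numpy as np
+
+def half_fraction_2k(k: int = 4) -> np.ndarray:
+    """2^(k-1) half-fraction: last factor is the product of the first k-1 factors."""
+    base = np.array(list(itertools.product([-1, 1], repeat=k - 1)))
+    last = base.prod(axis=1, keepdims=True)  # defining relation: X_k = X_1 * ... * X_{k-1}
+    return np.hstack([base, last])
+
+design = half_fraction_2k(4)  # 8 runs instead of the 16 needed by the full factorial
+print(design)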
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ Estimates on Learning Rates for Multi-Penalty Distribution Regression + + +
+ This paper is concerned with functional learning by utilizing two-stage +sampled distribution regression. We study a multi-penalty regularization +algorithm for distribution regression under the framework of learning theory. +The algorithm aims at regressing to real-valued outputs from probability +measures. The theoretical analysis of distribution regression is far from +mature and quite challenging, since only second-stage samples are observable +in practical settings. In the algorithm, to transform information from samples, +we embed the distributions into a reproducing kernel Hilbert space +$\mathcal{H}_K$ associated with a Mercer kernel $K$ via the mean embedding +technique. The main contribution of the paper is to present a novel +multi-penalty regularization algorithm to capture more features of distribution +regression and derive optimal learning rates for the algorithm. The work also +derives learning rates for distribution regression in the nonstandard setting +$f_{\rho}\notin\mathcal{H}_K$, which is not explored in the existing literature. +Moreover, we propose a distribution regression-based distributed learning +algorithm to address the challenges of large-scale data. The optimal +learning rates are derived for the distributed learning algorithm. By providing +new algorithms and showing their learning rates, we improve the existing work +in several aspects. + +
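+ For intuition, a minimal sketch of the two-stage pipeline with a single ridge-style penalty (the paper's multi-penalty scheme adds further regularization terms, omitted here); the kernel choice and bandwidth are illustrative:
+import numpy as np
+
+def rbf(x, y, gamma=1.0):
+    d = x[:, None, :] - y[None, :, :]
+    return np.exp(-gamma * (d ** 2).sum(-1))
+
+def bag_gram(bags, gamma=1.0):
+    """Gram matrix of kernel mean embeddings: G[i, j] = mean over bag_i x bag_j of k(x, y)."""
+    m = len(bags)
+    G = np.zeros((m, m))
+    for i in range(m):
+        for j in range(m):
+            G[i, j] = rbf(bags[i], bags[j], gamma).mean()
+    return G
+
+def fit_predict(train_bags, y, test_bags, lam=1e-2, gamma=1.0):
+    """Second stage: ridge-style solve on the mean-embedding Gram matrix."""
+    G = bag_gram(train_bags, gamma)
+    alpha = np.linalg.solve(G + lam * np.eye(len(train_bags)), y)
+    K_test = np.array([[rbf(tb, b, gamma).mean() for b in train_bags] for tb in test_bags])
+    return K_test @ alpha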
+
+
+
+
+ + ♻ ☆ Backdiff: a diffusion model for generalized transferable protein + backmapping + + +
+ Coarse-grained (CG) models play a crucial role in the study of protein +structures, protein thermodynamic properties, and protein conformation +dynamics. Due to the information loss in the coarse-graining process, +backmapping from CG to all-atom configurations is essential in many protein +design and drug discovery applications when detailed atomic representations are +needed for in-depth studies. Despite recent progress in data-driven backmapping +approaches, devising a backmapping method that can be universally applied +across various CG models and proteins remains unresolved. In this work, we +propose BackDiff, a new generative model designed to achieve generalization and +reliability in the protein backmapping problem. BackDiff leverages the +conditional score-based diffusion model with geometric representations. Since +different CG models can contain different coarse-grained sites which include +selected atoms (CG atoms) and simple CG auxiliary functions of atomistic +coordinates (CG auxiliary variables), we design a self-supervised training +framework to adapt to different CG atoms, and constrain the diffusion sampling +paths with arbitrary CG auxiliary variables as conditions. Our method +facilitates end-to-end training and allows efficient sampling across different +proteins and diverse CG models without the need for retraining. Comprehensive +experiments over multiple popular CG models demonstrate BackDiff's superior +performance to existing state-of-the-art approaches, and generalization and +flexibility that these approaches cannot achieve. A pretrained BackDiff model +can offer a convenient yet reliable plug-and-play solution for protein +researchers, enabling them to investigate further from their own CG models. + +
+
+ comment: 22 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Zero-Shot Self-Supervised Learning for MRI Reconstruction + + +
+ Deep learning (DL) has emerged as a powerful tool for accelerated MRI +reconstruction, but often necessitates a database of fully-sampled measurements +for training. Recent self-supervised and unsupervised learning approaches +enable training without fully-sampled data. However, a database of undersampled +measurements may not be available in many scenarios, especially for scans +involving contrast or translational acquisitions in development. Moreover, +recent studies show that database-trained models may not generalize well when +the unseen measurements differ in terms of sampling pattern, acceleration rate, +SNR, image contrast, and anatomy. Such challenges necessitate a new methodology +to enable subject-specific DL MRI reconstruction without external training +datasets, since it is clinically imperative to provide high-quality +reconstructions that can be used to identify lesions/disease for \emph{every +individual}. In this work, we propose a zero-shot self-supervised learning +approach to perform subject-specific accelerated DL MRI reconstruction to +tackle these issues. The proposed approach partitions the available +measurements from a single scan into three disjoint sets. Two of these sets are +used to enforce data consistency and define loss during training for +self-supervision, while the last set serves to self-validate, establishing an +early stopping criterion. In the presence of models pre-trained on a database +with different image characteristics, we show that the proposed approach can be +combined with transfer learning for faster convergence time and reduced +computational complexity. The code is available at +\url{https://github.com/byaman14/ZS-SSL}. + +
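+ A minimal sketch of the partitioning step described above, splitting a single scan's acquired k-space indices into three disjoint sets; the split fractions are illustrative assumptions and the reconstruction network itself is omitted:
+import numpy as np
+
+def partition_acquired_indices(acquired_idx, frac_loss=0.3, frac_val=0.2, seed=0):
+    """Split sampled k-space indices into consistency, loss, and self-validation sets."""
+    rng = np.random.default_rng(seed)
+    idx = rng.permutation(np.asarray(acquired_idx))
+    n = len(idx)
+    n_val = int(frac_val * n)
+    n_loss = int(frac_loss * n)
+    val_set = idx[:n_val]                   # self-validation / early stopping
+    loss_set = idx[n_val:n_val + n_loss]    # defines the training loss
+    consistency_set = idx[n_val + n_loss:]  # enforces data consistency during training
+    return consistency_set, loss_set, val_set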
+
+
+
+
+ + ♻ ☆ Graph-based Molecular Representation Learning + + +
+ Molecular representation learning (MRL) is a key step to build the connection +between machine learning and chemical science. In particular, it encodes +molecules as numerical vectors preserving the molecular structures and +features, on top of which the downstream tasks (e.g., property prediction) can +be performed. Recently, MRL has achieved considerable progress, especially in +methods based on deep molecular graph learning. In this survey, we +systematically review these graph-based molecular representation techniques, +especially the methods incorporating chemical domain knowledge. Specifically, +we first introduce the features of 2D and 3D molecular graphs. Then we +summarize and categorize MRL methods into three groups based on their input. +Furthermore, we discuss some typical chemical applications supported by MRL. To +facilitate studies in this fast-developing area, we also list the benchmarks +and commonly used datasets in the paper. Finally, we share our thoughts on +future research directions. + +
+
+
+
+
+ + ♻ ☆ Using Stochastic Gradient Descent to Smooth Nonconvex Functions: + Analysis of Implicit Graduated Optimization with Optimal Noise Scheduling + + +
+ The graduated optimization approach is a heuristic method for finding +globally optimal solutions for nonconvex functions and has been theoretically +analyzed in several studies. This paper defines a new family of nonconvex +functions for graduated optimization, discusses their sufficient conditions, +and provides a convergence analysis of the graduated optimization algorithm for +them. It shows that stochastic gradient descent (SGD) with mini-batch +stochastic gradients has the effect of smoothing the function, the degree of +which is determined by the learning rate and batch size. This finding provides +theoretical insights on why large batch sizes fall into sharp local minima, why +decaying learning rates and increasing batch sizes are superior to fixed +learning rates and batch sizes, and what the optimal learning rate scheduling +is. To the best of our knowledge, this is the first paper to provide a +theoretical explanation for these aspects. Moreover, a new graduated +optimization framework that uses a decaying learning rate and increasing batch +size is analyzed and experimental results of image classification that support +our theoretical findings are reported. + +
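+ A toy sketch of the implicit graduated-optimization recipe suggested by the analysis (decay the learning rate while growing the batch size across stages); the schedule constants are illustrative, not the paper's:
+import numpy as np
+
+def sgd_graduated(grad_fn, x0, stages=5, steps_per_stage=200,
+                  lr0=0.5, batch0=8, lr_decay=0.5, batch_growth=2, seed=0):
+    """grad_fn(x, batch_size, rng) should return a mini-batch stochastic gradient."""
+    rng = np.random.default_rng(seed)
+    x, lr, batch = np.asarray(x0, dtype=float), lr0, batch0
+    for _ in range(stages):
+        for _ in range(steps_per_stage):
+            x = x - lr * grad_fn(x, batch, rng)
+        lr *= lr_decay                  # smaller noise scale -> weaker implicit smoothing
+        batch = int(batch * batch_growth)
+    return x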
+
+ comment: The latest version was updated on Nov. 29 +
+
+
+
+
+ + ♻ ☆ Beyond Invariance: Test-Time Label-Shift Adaptation for Distributions + with "Spurious" Correlations + + +
+ Changes in the data distribution at test time can have deleterious effects on +the performance of predictive models $p(y|x)$. We consider situations where +there are additional meta-data labels (such as group labels), denoted by $z$, +that can account for such changes in the distribution. In particular, we assume +that the prior distribution $p(y, z)$, which models the dependence between the +class label $y$ and the "nuisance" factors $z$, may change across domains, +either due to a change in the correlation between these terms, or a change in +one of their marginals. However, we assume that the generative model for +features $p(x|y,z)$ is invariant across domains. We note that this corresponds +to an expanded version of the widely used "label shift" assumption, where the +labels now also include the nuisance factors $z$. Based on this observation, we +propose a test-time label shift correction that adapts to changes in the joint +distribution $p(y, z)$ using EM applied to unlabeled samples from the target +domain distribution, $p_t(x)$. Importantly, we are able to avoid fitting a +generative model $p(x|y, z)$, and merely need to reweight the outputs of a +discriminative model $p_s(y, z|x)$ trained on the source distribution. We +evaluate our method, which we call "Test-Time Label-Shift Adaptation" (TTLSA), +on several standard image and text datasets, as well as the CheXpert chest +X-ray dataset, and show that it improves performance over methods that target +invariance to changes in the distribution, as well as baseline empirical risk +minimization methods. Code for reproducing experiments is available at +https://github.com/nalzok/test-time-label-shift . + +
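+ A minimal sketch of the EM-style prior re-estimation that this kind of label-shift correction builds on (in the spirit of the classic EM reweighting scheme); y and z are folded into one joint class index, and `probs_src` are source-model posteriors on unlabeled target inputs:
+import numpy as np
+
+def em_prior_adaptation(probs_src, prior_src, n_iter=100, tol=1e-8):
+    """probs_src: (N, C) source posteriors over joint classes c=(y, z); prior_src: (C,)."""
+    prior_t = prior_src.copy()
+    for _ in range(n_iter):
+        # E-step: reweight source posteriors by the current target/source prior ratio
+        w = probs_src * (prior_t / prior_src)
+        probs_t = w / w.sum(axis=1, keepdims=True)
+        # M-step: new target prior is the average adapted posterior
+        new_prior = probs_t.mean(axis=0)
+        if np.abs(new_prior - prior_t).max() < tol:
+            prior_t = new_prior
+            break
+        prior_t = new_prior
+    return prior_t, probs_t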
+
+ comment: 24 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ BOIS: Bayesian Optimization of Interconnected Systems + + +
+ Bayesian optimization (BO) has proven to be an effective paradigm for the +global optimization of expensive-to-sample systems. One of the main advantages +of BO is its use of Gaussian processes (GPs) to characterize model uncertainty +which can be leveraged to guide the learning and search process. However, BO +typically treats systems as black-boxes and this limits the ability to exploit +structural knowledge (e.g., physics and sparse interconnections). Composite +functions of the form $f(x, y(x))$, wherein GP modeling is shifted from the +performance function $f$ to an intermediate function $y$, offer an avenue for +exploiting structural knowledge. However, the use of composite functions in a +BO framework is complicated by the need to generate a probability density for +$f$ from the Gaussian density of $y$ calculated by the GP (e.g., when $f$ is +nonlinear it is not possible to obtain a closed-form expression). Previous work +has handled this issue using sampling techniques; these are easy to implement +and flexible but are computationally intensive. In this work, we introduce a +new paradigm which allows for the efficient use of composite functions in BO; +this uses adaptive linearizations of $f$ to obtain closed-form expressions for +the statistical moments of the composite function. We show that this simple +approach (which we call BOIS) enables the exploitation of structural knowledge, +such as that arising in interconnected systems as well as systems that embed +multiple GP models and combinations of physics and GP models. Using a chemical +process optimization case study, we benchmark the effectiveness of BOIS against +standard BO and sampling approaches. Our results indicate that BOIS achieves +performance gains and accurately captures the statistics of composite +functions. + +
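+ An illustrative first-order (delta-method) propagation of a Gaussian posterior through a nonlinear outer function f, which is the kind of adaptive linearization the abstract describes; the example functions are placeholders, not the BOIS implementation:
+import numpy as np
+
+def linearized_moments(f, grad_f, mu_y, cov_y):
+    """Approximate mean/variance of f(y) for y ~ N(mu_y, cov_y) via linearization at mu_y."""
+    g = grad_f(mu_y)              # gradient of f at the GP posterior mean
+    mean_f = f(mu_y)              # first-order mean approximation
+    var_f = float(g @ cov_y @ g)  # g^T Sigma g
+    return mean_f, var_f
+
+# toy example: f(y) = y1 * y2 + y1**2
+f = lambda y: y[0] * y[1] + y[0] ** 2
+grad_f = lambda y: np.array([y[1] + 2 * y[0], y[0]])
+m, v = linearized_moments(f, grad_f, np.array([1.0, 2.0]), np.array([[0.1, 0.0], [0.0, 0.2]]))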
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Direction-oriented Multi-objective Learning: Simple and Provable + Stochastic Algorithms + + +
+ Multi-objective optimization (MOO) has become an influential framework in +many machine learning problems with multiple objectives such as learning with +multiple criteria and multi-task learning (MTL). In this paper, we propose a +new direction-oriented multi-objective problem by regularizing the common +descent direction within a neighborhood of a direction that optimizes a linear +combination of objectives such as the average loss in MTL. This formulation +includes GD and MGDA as special cases, enjoys the direction-oriented benefit as +in CAGrad, and facilitates the design of stochastic algorithms. To solve this +problem, we propose Stochastic Direction-oriented Multi-objective Gradient +descent (SDMGrad) with simple SGD type of updates, and its variant SDMGrad-OS +with an efficient objective sampling in the setting where the number of +objectives is large. For a constant-level regularization parameter $\lambda$, +we show that SDMGrad and SDMGrad-OS provably converge to a Pareto stationary +point with improved complexities and milder assumptions. For an increasing +$\lambda$, this convergent point reduces to a stationary point of the linear +combination of objectives. We demonstrate the superior performance of the +proposed methods in a series of tasks on multi-task supervised learning and +reinforcement learning. Code is provided at +https://github.com/ml-opt-lab/sdmgrad. + +
+
+
+
+
+ + ♻ ☆ Inference of CO2 flow patterns -- a feasibility study NeurIPS 2023 + + +
+ As the global deployment of carbon capture and sequestration (CCS) technology +intensifies in the fight against climate change, it becomes increasingly +imperative to establish robust monitoring and detection mechanisms for +potential underground CO2 leakage, particularly through pre-existing or induced +faults in the storage reservoir's seals. While techniques such as history +matching and time-lapse seismic monitoring of CO2 storage have been used +successfully in tracking the evolution of CO2 plumes in the subsurface, these +methods lack principled approaches to characterize uncertainties related to the +CO2 plumes' behavior. Inclusion of systematic assessment of uncertainties is +essential for risk mitigation for the following reasons: (i) CO2 plume-induced +changes are small and seismic data is noisy; (ii) changes between regular and +irregular (e.g., caused by leakage) flow patterns are small; and (iii) the +reservoir properties that control the flow are strongly heterogeneous and +typically only available as distributions. To arrive at a formulation capable +of inferring flow patterns for regular and irregular flow from well and seismic +data, the performance of conditional normalizing flow will be analyzed on a +series of carefully designed numerical experiments. While the inferences +presented are preliminary in the context of an early CO2 leakage detection +system, the results do indicate that inferences with conditional normalizing +flows can produce high-fidelity estimates for CO2 plumes with or without +leakage. We are also confident that the inferred uncertainty is reasonable +because it correlates well with the observed errors. This uncertainty stems +from noise in the seismic data and from the lack of precise knowledge of the +reservoir's fluid flow properties. + +
+
+ comment: Accepted in NeurIPS 2023 Workshop - Tackling Climate Change with + Machine Learning (Spotlight) +
+
+
+
+
+ + ♻ ☆ ChatTraffic: Text-to-Traffic Generation via Diffusion Model + + +
+ Traffic prediction is one of the most significant foundations in Intelligent +Transportation Systems (ITS). Traditional traffic prediction methods rely only +on historical traffic data to predict traffic trends and face two main +challenges: 1) insensitivity to unusual events, and 2) poor performance in +long-term prediction. In this work, we explore how generative models combined +with text describing the traffic system can be applied for traffic generation +and name the task Text-to-Traffic Generation (TTG). The key challenge of the +TTG task is how to associate text with the spatial structure of the road +network and traffic data for generating traffic situations. To this end, we +propose ChatTraffic, the first diffusion model for text-to-traffic generation. +To guarantee the consistency between synthetic and real data, we augment a +diffusion model with the Graph Convolutional Network (GCN) to extract spatial +correlations of traffic data. In addition, we construct a large dataset +containing text-traffic pairs for the TTG task. We benchmark our model +qualitatively and quantitatively on the released dataset. The experimental +results indicate that ChatTraffic can generate realistic traffic situations +from the text. Our code and dataset are available at +https://github.com/ChyaZhang/ChatTraffic. + +
+
+
+
+
+ + ♻ ☆ Dynamic DAG Discovery for Interpretable Imitation Learning + + +
+ Imitation learning, which learns agent policy by mimicking expert +demonstration, has shown promising results in many applications such as medical +treatment regimes and self-driving vehicles. However, it remains a difficult +task to interpret control policies learned by the agent. Difficulties mainly +come from two aspects: 1) agents in imitation learning are usually implemented +as deep neural networks, which are black-box models and lack interpretability; +2) the latent causal mechanism behind agents' decisions may vary along the +trajectory, rather than staying static throughout time steps. To increase +transparency and offer better interpretability of the neural agent, we propose +to expose its captured knowledge in the form of a directed acyclic causal +graph, with nodes being action and state variables and edges denoting the +causal relations behind predictions. Furthermore, we design this causal +discovery process to be state-dependent, enabling it to model the dynamics in +latent causal graphs. Concretely, we conduct causal discovery from the +perspective of Granger causality and propose a self-explainable imitation +learning framework, {\method}. The proposed framework is composed of three +parts: a dynamic causal discovery module, a causality encoding module, and a +prediction module, and is trained in an end-to-end manner. After the model is +learned, we can obtain causal relations among states and action variables +behind its decisions, exposing policies learned by it. Experimental results on +both synthetic and real-world datasets demonstrate the effectiveness of the +proposed {\method} in learning the dynamic causal graphs for understanding the +decision-making of imitation learning meanwhile maintaining high prediction +accuracy. + +
+
+
+
+
+ + ♻ ☆ A novel decomposed-ensemble time series forecasting framework: capturing + underlying volatility information + + +
+ Time series forecasting represents a significant and challenging task across +various fields. Recently, methods based on mode decomposition have dominated +the forecasting of complex time series because of the advantages of capturing +local characteristics and extracting intrinsic modes from data. Unfortunately, +most models fail to capture the implied volatilities that contain significant +information. To enhance the prediction of contemporary diverse and complex time +series, we propose a novel time series forecasting paradigm that integrates +decomposition with the capability to capture the underlying fluctuation +information of the series. In our methodology, we implement the Variational +Mode Decomposition algorithm to decompose the time series into K distinct +sub-modes. Following this decomposition, we apply the Generalized +Autoregressive Conditional Heteroskedasticity (GARCH) model to extract the +volatility information in these sub-modes. Subsequently, both the numerical +data and the volatility information for each sub-mode are harnessed to train a +neural network. This network is adept at predicting the information of the +sub-modes, and we aggregate the predictions of all sub-modes to generate the +final output. By integrating econometric and artificial intelligence methods, +and taking into account both the numerical and volatility information of the +time series, our proposed framework demonstrates superior performance in time +series forecasting, as evidenced by the significant decrease in MSE, RMSE, and +MAPE in our comparative experimental results. + +
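+ A rough sketch of the decomposition-plus-volatility pipeline; `vmd` and `fit_predict` are assumed callables standing in for the VMD routine and the per-mode neural predictor, and a rolling standard deviation is used here as a simple stand-in for the GARCH volatility estimate:
+import numpy as np
+
+def rolling_volatility(x, window=20):
+    """Stand-in for the GARCH step: rolling std of first differences as a volatility proxy."""
+    r = np.diff(x, prepend=x[0])
+    return np.array([r[max(0, t - window):t + 1].std() for t in range(len(r))])
+
+def decompose_and_forecast(series, vmd, fit_predict, k_modes=5):
+    """vmd(series, k) -> list of sub-modes; fit_predict(features) -> forecast for one mode."""
+    modes = vmd(series, k_modes)
+    forecasts = []
+    for m in modes:
+        vol = rolling_volatility(m)
+        features = np.stack([m, vol], axis=1)  # numerical + volatility information
+        forecasts.append(fit_predict(features))
+    return sum(forecasts)                       # aggregate the sub-mode predictions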
+
+
+
+
+ + ♻ ☆ Generative quantum machine learning via denoising diffusion + probabilistic models + + +
+ Deep generative models are a key enabling technology for computer vision, text +generation and large language models. Denoising diffusion probabilistic models +(DDPMs) have recently gained much attention due to their ability to generate +diverse and high-quality samples in many computer vision tasks, as well as to +incorporate flexible model architectures and a relatively simple training +scheme. Quantum generative models, empowered by entanglement and superposition, +have brought new insight to learning classical and quantum data. Inspired by +the classical counterpart, we propose the quantum denoising diffusion +probabilistic model (QuDDPM) to enable efficiently trainable generative +learning of quantum data. QuDDPM adopts sufficient layers of circuits to +guarantee expressivity, while introducing multiple intermediate training tasks +as interpolation between the target distribution and noise to avoid barren +plateaus and guarantee efficient training. We provide bounds on the learning +error and demonstrate QuDDPM's capability in learning correlated quantum noise +models, quantum many-body phases and the topological structure of quantum data. +The results provide a paradigm for versatile and efficient quantum generative +learning. + +
+
+ comment: 5+7 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Modular Quantization-Aware Training: Increasing Accuracy by Decreasing + Precision in 6D Object Pose Estimation + + +
+ Edge applications, such as collaborative robotics and spacecraft rendezvous, +demand efficient 6D object pose estimation on resource-constrained embedded +platforms. Existing 6D pose estimation networks are often too large for such +deployments, necessitating compression while maintaining reliable performance. +To address this challenge, we introduce Modular Quantization-Aware Training +(MQAT), an adaptive and mixed-precision quantization-aware training strategy +that exploits the modular structure of modern 6D pose estimation architectures. +MQAT guides a systematic gradated modular quantization sequence and determines +module-specific bit precisions, leading to quantized models that outperform +those produced by state-of-the-art uniform and mixed-precision quantization +techniques. Our experiments showcase the generality of MQAT across datasets, +architectures, and quantization algorithms. Remarkably, MQAT-trained quantized +models achieve a significant accuracy boost (>7%) over the baseline +full-precision network while reducing model size by a factor of 4x or more. + +
+
+
+
+
+ + ♻ ☆ GROOT: Learning to Follow Instructions by Watching Gameplay Videos + + +
+ We study the problem of building a controller that can follow open-ended +instructions in open-world environments. We propose to follow reference videos +as instructions, which offer expressive goal specifications while eliminating +the need for expensive text-gameplay annotations. A new learning framework is +derived to allow learning such instruction-following controllers from gameplay +videos while producing a video instruction encoder that induces a structured +goal space. We implement our agent GROOT in a simple yet effective +encoder-decoder architecture based on causal transformers. We evaluate GROOT +against open-world counterparts and human players on a proposed Minecraft +SkillForge benchmark. The Elo ratings clearly show that GROOT is closing the +human-machine gap as well as exhibiting a 70% winning rate over the best +generalist agent baseline. Qualitative analysis of the induced goal space +further demonstrates some interesting emergent properties, including the goal +composition and complex gameplay behavior synthesis. The project page is +available at https://craftjarvis-groot.github.io. + +
+
+
+
+
+ + ♻ ☆ A Good Feature Extractor Is All You Need for Weakly Supervised Learning + in Histopathology + + +
+ Deep learning is revolutionising pathology, offering novel opportunities in +disease prognosis and personalised treatment. Historically, stain normalisation +has been a crucial preprocessing step in computational pathology pipelines, and +persists into the deep learning era. Yet, with the emergence of feature +extractors trained using self-supervised learning (SSL) on diverse pathology +datasets, we call this practice into question. In an empirical evaluation of +publicly available feature extractors, we find that omitting stain +normalisation and image augmentations does not compromise downstream +performance, while incurring substantial savings in memory and compute. +Further, we show that the top-performing feature extractors are remarkably +robust to variations in stain and augmentations like rotation in their latent +space. Contrary to previous patch-level benchmarking studies, our approach +emphasises clinical relevance by focusing on slide-level prediction tasks in a +weakly supervised setting with external validation cohorts. This work +represents the most comprehensive robustness evaluation of public pathology SSL +feature extractors to date, involving more than 6,000 training runs across nine +tasks, five datasets, three downstream architectures, and various preprocessing +setups. Our findings stand to streamline digital pathology workflows by +minimising preprocessing needs and informing the selection of feature +extractors. + +
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ Cinematic Behavior Transfer via NeRF-based Differentiable Filming + + +
+ In the evolving landscape of digital media and video production, the precise +manipulation and reproduction of visual elements like camera movements and +character actions are highly desired. Existing SLAM methods face limitations in +dynamic scenes and human pose estimation often focuses on 2D projections, +neglecting 3D statuses. To address these issues, we first introduce a reverse +filming behavior estimation technique. It optimizes camera trajectories by +leveraging NeRF as a differentiable renderer and refining SMPL tracks. We then +introduce a cinematic transfer pipeline that is able to transfer various shot +types to a new 2D video or a 3D virtual environment. The incorporation of 3D +engine workflow enables superior rendering and control abilities, which also +achieves a higher rating in the user study. + +
+
+ comment: Project Page: + https://virtualfilmstudio.github.io/projects/cinetransfer +
+
+
+
+
+ + ☆ BAND-2k: Banding Artifact Noticeable Database for Banding Detection and + Quality Assessment + + +
+ Banding, also known as staircase-like contours, frequently occurs in flat +areas of images/videos processed by the compression or quantization algorithms. +As undesirable artifacts, banding destroys the original image structure, thus +degrading users' quality of experience (QoE). In this paper, we systematically +investigate the banding image quality assessment (IQA) problem, aiming to +detect the image banding artifacts and evaluate their perceptual visual +quality. Considering that the existing image banding databases only contain +limited content sources and banding generation methods, and lack perceptual +quality labels (i.e. mean opinion scores), we first build the largest banding +IQA database so far, named Banding Artifact Noticeable Database (BAND-2k), +which consists of 2,000 banding images generated by 15 compression and +quantization schemes. A total of 23 workers participated in the subjective IQA +experiment, yielding over 214,000 patch-level banding class labels and 44,371 +reliable image-level quality ratings. Subsequently, we develop an effective +no-reference (NR) banding evaluator for banding detection and quality +assessment by leveraging frequency characteristics of banding artifacts. A dual +convolutional neural network is employed to concurrently learn the feature +representation from the high-frequency and low-frequency maps, thereby +enhancing the ability to discern banding artifacts. The quality score of a +banding image is generated by pooling the banding detection maps masked by the +spatial frequency filters. Experiments demonstrate that our banding evaluator +achieves a remarkably high accuracy in banding detection and also exhibits high +SRCC and PLCC results with the perceptual quality labels. These findings unveil +the strong correlations between the intensity of banding artifacts and the +perceptual visual quality, thus validating the necessity of banding quality +assessment. + +
+
+
+
+
+ + ☆ Vulnerability of Automatic Identity Recognition to Audio-Visual + Deepfakes + + +
+ The task of deepfake detection is far from being solved by speech or vision +researchers. Several publicly available databases of fake synthetic video and +speech were built to aid the development of detection methods. However, +existing databases typically focus on visual or voice modalities and provide no +proof that their deepfakes can in fact impersonate any real person. In this +paper, we present the first realistic audio-visual database of deepfakes, +SWAN-DF, where lips and speech are well synchronized and the videos have high +visual and audio quality. We took the publicly available SWAN dataset of real +videos with different identities to create audio-visual deepfakes using several +models from DeepFaceLab and blending techniques for face swapping and HiFiVC, +DiffVC, YourTTS, and FreeVC models for voice conversion. From the publicly +available speech dataset LibriTTS, we also created a separate database of +audio-only deepfakes, LibriTTS-DF, using several recent text-to-speech methods: +YourTTS, Adaspeech, and TorToiSe. We demonstrate the vulnerability of a +state-of-the-art speaker recognition system, such as the ECAPA-TDNN-based model +from SpeechBrain, to the synthetic voices. Similarly, we tested a face +recognition system based on the MobileFaceNet architecture against several +variants of our visual deepfakes. The vulnerability assessment shows that by +tuning the existing pretrained deepfake models to specific identities, one can +successfully spoof the face and speaker recognition systems more than 90% of +the time and achieve a very realistic looking and sounding fake video of a +given person. + +
+
+ comment: 10 pages, 3 figures, 3 tables +
+
+
+
+
+ + ☆ Implicit-explicit Integrated Representations for Multi-view Video + Compression + + +
+ With the increasing consumption of 3D displays and virtual reality, +multi-view video has become a promising format. However, its high resolution +and multi-camera shooting result in a substantial increase in data volume, +making storage and transmission a challenging task. To tackle these +difficulties, we propose an implicit-explicit integrated representation for +multi-view video compression. Specifically, we first use the explicit +representation-based 2D video codec to encode one of the source views. +Subsequently, we propose employing the implicit neural representation +(INR)-based codec to encode the remaining views. The implicit codec takes the +time and view index of multi-view video as coordinate inputs and generates the +corresponding implicit reconstruction frames. To enhance the compressibility, +we introduce a multi-level feature grid embedding and a fully convolutional +architecture into the implicit codec. These components facilitate +coordinate-feature and feature-RGB mapping, respectively. To further enhance +the reconstruction quality from the INR codec, we leverage the high-quality +reconstructed frames from the explicit codec to achieve inter-view +compensation. Finally, the compensated results are fused with the implicit +reconstructions from the INR to obtain the final reconstructed frames. Our +proposed framework combines the strengths of both implicit neural +representation and explicit 2D codec. Extensive experiments conducted on public +datasets demonstrate that the proposed framework can achieve comparable or even +superior performance to the latest multi-view video compression standard MIV +and other INR-based schemes in terms of view compression and scene modeling. + +
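+ A minimal sketch of the implicit branch only: a coordinate network mapping a normalized (time, view) pair to a frame; the frame size is an illustrative assumption, and the feature-grid embedding, convolutional decoder, and inter-view compensation are omitted:
+import torch
+import torch.nn as nn
+
+class TinyINRCodec(nn.Module):
+    """Maps a normalized (time, view) coordinate to an RGB frame of shape (3, H, W)."""
+    def __init__(self, h=64, w=64, hidden=256):
+        super().__init__()
+        self.h, self.w = h, w
+        self.net = nn.Sequential(
+            nn.Linear(2, hidden), nn.ReLU(),
+            nn.Linear(hidden, hidden), nn.ReLU(),
+            nn.Linear(hidden, 3 * h * w), nn.Sigmoid(),
+        )
+
+    def forward(self, t, v):
+        coord = torch.stack([t, v], dim=-1)  # (B, 2) coordinate input
+        out = self.net(coord)
+        return out.view(-1, 3, self.h, self.w)
+
+# usage: frame = TinyINRCodec()(torch.tensor([0.1]), torch.tensor([0.5]))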
+
+
+
+
+ + ☆ eMotions: A Large-Scale Dataset for Emotion Recognition in Short Videos + + +
+ Nowadays, short videos (SVs) are essential to information acquisition and +sharing in our life. The prevailing use of SVs to spread emotions leads to the +necessity of emotion recognition in SVs. Considering the lack of SVs emotion +data, we introduce a large-scale dataset named eMotions, comprising 27,996 +videos. Meanwhile, we alleviate the impact of subjectivities on labeling +quality by emphasizing better personnel allocations and multi-stage +annotations. In addition, we provide the category-balanced and test-oriented +variants through targeted data sampling. Some commonly used videos (e.g., +facial expressions and postures) have been well studied. However, it is still +challenging to understand the emotions in SVs. Since the enhanced content +diversity brings more distinct semantic gaps and difficulties in learning +emotion-related features, and there exists information gaps caused by the +emotion incompleteness under the prevalently audio-visual co-expressions. To +tackle these problems, we present an end-to-end baseline method AV-CPNet that +employs the video transformer to better learn semantically relevant +representations. We further design the two-stage cross-modal fusion module to +complementarily model the correlations of audio-visual features. The EP-CE +Loss, incorporating three emotion polarities, is then applied to guide model +optimization. Extensive experimental results on nine datasets verify the +effectiveness of AV-CPNet. Datasets and code will be open on +https://github.com/XuecWu/eMotions. + +
+
+
+
+
+ + ♻ ☆ TCDM: Transformational Complexity Based Distortion Metric for Perceptual + Point Cloud Quality Assessment + + +
+ The goal of objective point cloud quality assessment (PCQA) research is to +develop quantitative metrics that measure point cloud quality in a perceptually +consistent manner. Merging the research of cognitive science and intuition of +the human visual system (HVS), in this paper, we evaluate the point cloud +quality by measuring the complexity of transforming the distorted point cloud +back to its reference, which in practice can be approximated by the code length +of one point cloud when the other is given. For this purpose, we first make +space segmentation for the reference and distorted point clouds based on a 3D +Voronoi diagram to obtain a series of local patch pairs. Next, inspired by the +predictive coding theory, we utilize a space-aware vector autoregressive +(SA-VAR) model to encode the geometry and color channels of each reference +patch with and without the distorted patch, respectively. Assuming that the +residual errors follow the multi-variate Gaussian distributions, the +self-complexity of the reference and transformational complexity between the +reference and distorted samples are computed using covariance matrices. +Additionally, the prediction terms generated by SA-VAR are introduced as one +auxiliary feature to promote the final quality prediction. The effectiveness of +the proposed transformational complexity based distortion metric (TCDM) is +evaluated through extensive experiments conducted on five public point cloud +quality assessment databases. The results demonstrate that TCDM achieves +state-of-the-art (SOTA) performance, and further analysis confirms its +robustness in various scenarios. The code is publicly available at +https://github.com/zyj1318053/TCDM. + +
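+ As a rough illustration of the complexity notion above (not the authors' exact estimator): under a multivariate Gaussian residual assumption, the per-sample code length is driven by the log-determinant of the residual covariance:
+import numpy as np
+
+def gaussian_code_length(residuals: np.ndarray) -> float:
+    """residuals: (n, d) prediction errors; differential-entropy-style code length per sample."""
+    d = residuals.shape[1]
+    cov = np.cov(residuals, rowvar=False) + 1e-8 * np.eye(d)
+    _, logdet = np.linalg.slogdet(2 * np.pi * np.e * cov)
+    return 0.5 * logdet  # larger residual covariance -> longer code -> higher complexity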
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 73 + +
+
+
+ + ☆ MobileCLIP: Fast Image-Text Models through Multi-Modal Reinforced + Training + + +
+ Contrastive pretraining of image-text foundation models, such as CLIP, +demonstrated excellent zero-shot performance and improved robustness on a wide +range of downstream tasks. However, these models utilize large +transformer-based encoders with significant memory and latency overhead which +pose challenges for deployment on mobile devices. In this work, we introduce +MobileCLIP -- a new family of efficient image-text models optimized for runtime +performance along with a novel and efficient training approach, namely +multi-modal reinforced training. The proposed training approach leverages +knowledge transfer from an image captioning model and an ensemble of strong +CLIP encoders to improve the accuracy of efficient models. Our approach avoids +train-time compute overhead by storing the additional knowledge in a reinforced +dataset. MobileCLIP sets a new state-of-the-art latency-accuracy tradeoff for +zero-shot classification and retrieval tasks on several datasets. Our +MobileCLIP-S2 variant is 2.3$\times$ faster while more accurate compared to +previous best CLIP model based on ViT-B/16. We further demonstrate the +effectiveness of our multi-modal reinforced training by training a CLIP model +based on ViT-B/16 image backbone and achieving +2.9% average performance +improvement on 38 evaluation benchmarks compared to the previous best. +Moreover, we show that the proposed approach achieves 10$\times$-1000$\times$ +improved learning efficiency when compared with non-reinforced CLIP training. + +
+
+
+
+
+ + ☆ LLaMA-VID: An Image is Worth 2 Tokens in Large Language Models + + +
+ In this work, we present a novel method to tackle the token generation +challenge in Vision Language Models (VLMs) for video and image understanding, +called LLaMA-VID. Current VLMs, while proficient in tasks like image captioning +and visual question answering, face computational burdens when processing long +videos due to the excessive visual tokens. LLaMA-VID addresses this issue by +representing each frame with two distinct tokens, namely a context token and a +content token. The context token encodes the overall image context based on +user input, whereas the content token encapsulates visual cues in each frame. +This dual-token strategy significantly reduces the overload of long videos +while preserving critical information. Generally, LLaMA-VID empowers existing +frameworks to support hour-long videos and pushes their upper limit with an +extra context token. It is shown to surpass previous methods on most video- and +image-based benchmarks. Code is available at +https://github.com/dvlab-research/LLaMA-VID. + +
+
+ comment: Code is available at https://github.com/dvlab-research/LLaMA-VID +
+
+
+
+
+ + ☆ Efficient In-Context Learning in Vision-Language Models for Egocentric + Videos + + +
+ Recent advancements in text-only large language models (LLMs) have +highlighted the benefit of in-context learning for adapting to new tasks with a +few demonstrations. However, extending in-context learning to large +vision-language models (VLMs) using a huge amount of naturalistic +vision-language data has shown limited success, particularly for egocentric +videos, due to high data collection costs. We propose a novel training method +$\mathbb{E}$fficient $\mathbb{I}$n-context $\mathbb{L}$earning on +$\mathbb{E}$gocentric $\mathbb{V}$ideos ($\mathbb{EILEV}$), which elicits +in-context learning in VLMs for egocentric videos without requiring massive, +naturalistic egocentric video datasets. $\mathbb{EILEV}$ involves architectural +and training data adaptations to allow the model to process contexts +interleaved with video clips and narrations, sampling of in-context examples +with clusters of similar verbs and nouns, use of data with skewed marginal +distributions with a long tail of infrequent verbs and nouns, as well as +homonyms and synonyms. Our evaluations show that $\mathbb{EILEV}$-trained +models outperform larger VLMs trained on a huge amount of naturalistic data in +in-context learning. Furthermore, they can generalize to not only +out-of-distribution, but also novel, rare egocentric videos and texts via +in-context learning, demonstrating potential for applications requiring +cost-effective training, and rapid post-deployment adaptability. Our code and +demo are available at \url{https://github.com/yukw777/EILEV}. + +
+
+
+
+
+ + ☆ Scalable Extraction of Training Data from (Production) Language Models + + +
+ This paper studies extractable memorization: training data that an adversary +can efficiently extract by querying a machine learning model without prior +knowledge of the training dataset. We show an adversary can extract gigabytes +of training data from open-source language models like Pythia or GPT-Neo, +semi-open models like LLaMA or Falcon, and closed models like ChatGPT. Existing +techniques from the literature suffice to attack unaligned models; in order to +attack the aligned ChatGPT, we develop a new divergence attack that causes the +model to diverge from its chatbot-style generations and emit training data at a +rate 150x higher than when behaving properly. Our methods show practical +attacks can recover far more data than previously thought, and reveal that +current alignment techniques do not eliminate memorization. + +
+
+
+
+
+ + ☆ Is This the Subspace You Are Looking for? An Interpretability Illusion + for Subspace Activation Patching NeurIPS 2023 + + +
+ Mechanistic interpretability aims to understand model behaviors in terms of +specific, interpretable features, often hypothesized to manifest as +low-dimensional subspaces of activations. Specifically, recent studies have +explored subspace interventions (such as activation patching) as a way to +simultaneously manipulate model behavior and attribute the features behind it +to given subspaces. + In this work, we demonstrate that these two aims diverge, potentially leading +to an illusory sense of interpretability. Counterintuitively, even if a +subspace intervention makes the model's output behave as if the value of a +feature was changed, this effect may be achieved by activating a dormant +parallel pathway leveraging another subspace that is causally disconnected from +model outputs. We demonstrate this phenomenon in a distilled mathematical +example, in two real-world domains (the indirect object identification task and +factual recall), and present evidence for its prevalence in practice. In the +context of factual recall, we further show a link to rank-1 fact editing, +providing a mechanistic explanation for previous work observing an +inconsistency between fact editing performance and fact localization. + However, this does not imply that activation patching of subspaces is +intrinsically unfit for interpretability. To contextualize our findings, we +also show what a success case looks like in a task (indirect object +identification) where prior manual circuit analysis informs an understanding of +the location of a feature. We explore the additional evidence needed to argue +that a patched subspace is faithful. + +
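+ A small sketch of what patching a subspace means concretely: only the component of an activation lying in a given subspace is swapped in from another run, while the orthogonal complement is kept; the model and hook wiring are omitted, and U is assumed to be an orthonormal basis of the subspace:
+import torch
+
+def patch_subspace(h_orig: torch.Tensor, h_src: torch.Tensor, U: torch.Tensor) -> torch.Tensor:
+    """h_orig, h_src: (d,) activations from the original and source runs; U: (d, k) orthonormal."""
+    P = U @ U.T                             # projector onto the subspace
+    return h_orig - P @ h_orig + P @ h_src  # replace only the in-subspace component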
+
+ comment: NeurIPS 2023 Workshop on Attributing Model Behavior at Scale +
+
+
+
+
+ + ☆ ChatGPT's One-year Anniversary: Are Open-Source Large Language Models + Catching up? + + +
+ Since its release in late 2022, ChatGPT has brought a seismic shift in the +entire landscape of AI, both in research and commerce. Through +instruction-tuning a large language model (LLM) with supervised fine-tuning and +reinforcement learning from human feedback, it showed that a model could answer +human questions and follow instructions on a broad panel of tasks. Following +this success, interest in LLMs has intensified, with new LLMs appearing at +frequent intervals across academia and industry, including many start-ups +focused on LLMs. While closed-source LLMs (e.g., OpenAI's GPT, Anthropic's +Claude) generally outperform their open-source counterparts, the progress on +the latter has been rapid, with claims of achieving parity or even surpassing +them on certain tasks. This has crucial implications not only for research but +also for business. In this work, on the first anniversary of ChatGPT, we +provide an exhaustive overview of this success, surveying all tasks where an +open-source LLM has been claimed to be on par with or better than ChatGPT. + +
+
+
+
+
+ + ☆ Assessing the influence of attractor-verb distance on grammatical + agreement in humans and language models EMNLP 2023 + + +
+ Subject-verb agreement in the presence of an attractor noun located between +the main noun and the verb elicits complex behavior: judgments of +grammaticality are modulated by the grammatical features of the attractor. For +example, in the sentence "The girl near the boys likes climbing", the attractor +(boys) disagrees in grammatical number with the verb (likes), creating a +locally implausible transition probability. Here, we parametrically modulate +the distance between the attractor and the verb while keeping the length of the +sentence equal. We evaluate the performance of both humans and two artificial +neural network models: both make more mistakes when the attractor is closer to +the verb, but neural networks get close to the chance level while humans are +mostly able to overcome the attractor interference. Additionally, we report a +linear effect of attractor distance on reaction times. We hypothesize that a +possible reason for the proximity effect is the calculation of transition +probabilities between adjacent words. Nevertheless, classical models of +attraction such as the cue-based model might suffice to explain this +phenomenon, thus paving the way for new research. Data and analyses available +at https://osf.io/d4g6k + +
+
+ comment: 10 pages (5 main, 2 refs, 3 supplementary) ; 5 figures (3 main, 2 + supplementary) ; accepted at EMNLP 2023 (no DOI yet) +
+
+
+
+
+ + ☆ Natural Language Processing Through Transfer Learning: A Case Study on + Sentiment Analysis + + +
+ Artificial intelligence and machine learning have significantly bolstered the +technological world. This paper explores the potential of transfer learning in +natural language processing, focusing mainly on sentiment analysis. Models +trained on large datasets can also be used where data are scarce. The claim is +that, compared to training models from scratch, transfer learning using +pre-trained BERT models can increase sentiment classification accuracy. The +study adopts a sophisticated experimental design that uses the IMDb dataset of +sentimentally labelled movie reviews. Pre-processing includes tokenization and +encoding of text data, making it suitable for NLP models. The dataset is used +on a BERT-based model, measuring its performance using accuracy. The reported +accuracy is 100 per cent. Although such perfect accuracy may appear impressive, +it might be the result of overfitting or a lack of generalization. Further +analysis is required to ensure the model's ability to handle diverse and unseen +data. The findings underscore the effectiveness of transfer learning in NLP, +showcasing its potential to excel in sentiment analysis tasks. However, the +research calls for a cautious interpretation of perfect accuracy and emphasizes +the need for additional measures to validate the model's generalization. + +
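+ A minimal sketch of the transfer-learning setup described above (a pre-trained BERT encoder fine-tuned as a binary sentiment classifier); the model name and hyperparameters are illustrative:
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+tok = AutoTokenizer.from_pretrained("bert-base-uncased")
+model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
+optim = torch.optim.AdamW(model.parameters(), lr=2e-5)
+
+def train_step(texts, labels):
+    """One fine-tuning step on a small batch of review texts and 0/1 sentiment labels."""
+    batch = tok(texts, padding=True, truncation=True, return_tensors="pt")
+    out = model(**batch, labels=torch.tensor(labels))  # cross-entropy loss computed internally
+    out.loss.backward()
+    optim.step()
+    optim.zero_grad()
+    return out.loss.item()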
+
+ comment: 12 pages, 1 table, 4 figures +
+
+
+
+
+ + ☆ Debiasing Multimodal Models via Causal Information Minimization EMNLP 2023 + + +
+ Most existing debiasing methods for multimodal models, including causal +intervention and inference methods, utilize approximate heuristics to represent +the biases, such as shallow features from early stages of training or unimodal +features for multimodal tasks like VQA, etc., which may not be accurate. In +this paper, we study bias arising from confounders in a causal graph for +multimodal data and examine a novel approach that leverages causally-motivated +information minimization to learn the confounder representations. Robust +predictive features contain diverse information that helps a model generalize +to out-of-distribution data. Hence, minimizing the information content of +features obtained from a pretrained biased model helps learn the simplest +predictive features that capture the underlying data distribution. We treat +these features as confounder representations and use them via methods motivated +by causal theory to remove bias from models. We find that the learned +confounder representations indeed capture dataset biases, and the proposed +debiasing methods improve out-of-distribution (OOD) performance on multiple +multimodal datasets without sacrificing in-distribution performance. +Additionally, we introduce a novel metric to quantify the sufficiency of +spurious features in models' predictions that further demonstrates the +effectiveness of our proposed methods. Our code is available at: +https://github.com/Vaidehi99/CausalInfoMin + +
+
+ comment: EMNLP 2023 Findings (16 pages) +
+
+
+
+
+ + ☆ Mitigating Object Hallucinations in Large Vision-Language Models through + Visual Contrastive Decoding + + +
+ Large Vision-Language Models (LVLMs) have advanced considerably, intertwining +visual recognition and language understanding to generate content that is not +only coherent but also contextually attuned. Despite their success, LVLMs still +suffer from the issue of object hallucinations, where models generate plausible +yet incorrect outputs that include objects that do not exist in the images. To +mitigate this issue, we introduce Visual Contrastive Decoding (VCD), a simple +and training-free method that contrasts output distributions derived from +original and distorted visual inputs. The proposed VCD effectively reduces the +over-reliance on statistical bias and unimodal priors, two essential causes of +object hallucinations. This adjustment ensures the generated content is closely +grounded to visual inputs, resulting in contextually accurate outputs. Our +experiments show that VCD, without either additional training or the usage of +external tools, significantly mitigates the object hallucination issue across +different LVLM families. Beyond mitigating object hallucinations, VCD also +excels in general LVLM benchmarks, highlighting its wide-ranging applicability. + +
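+ The core contrastive adjustment can be written in a few lines; the sketch below contrasts next-token logits computed from the original and distorted visual inputs (alpha is an illustrative hyperparameter, and any adaptive plausibility constraint is omitted):
+import torch
+
+def contrastive_logits(logits_orig: torch.Tensor, logits_distorted: torch.Tensor,
+                       alpha: float = 1.0) -> torch.Tensor:
+    """Amplify what the original visual input supports relative to the distorted one."""
+    return (1 + alpha) * logits_orig - alpha * logits_distorted
+
+# next_token = contrastive_logits(l_orig, l_dist).argmax(dim=-1)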
+
+
+
+
+ + ☆ Optimisation-Based Multi-Modal Semantic Image Editing + + +
+ Image editing affords increased control over the aesthetics and content of +generated images. Pre-existing works focus predominantly on text-based +instructions to achieve desired image modifications, which limit edit precision +and accuracy. In this work, we propose an inference-time editing optimisation, +designed to extend beyond textual edits to accommodate multiple editing +instruction types (e.g. spatial layout-based; pose, scribbles, edge maps). We +propose to disentangle the editing task into two competing subtasks: successful +local image modifications and global content consistency preservation, where +subtasks are guided through two dedicated loss functions. By allowing to adjust +the influence of each loss function, we build a flexible editing solution that +can be adjusted to user preferences. We evaluate our method using text, pose +and scribble edit conditions, and highlight our ability to achieve complex +edits, through both qualitative and quantitative experiments. + +
+
+
+
+
+ + ☆ The Falcon Series of Open Language Models + + +
+ We introduce the Falcon series: 7B, 40B, and 180B parameter causal +decoder-only models trained on diverse, high-quality corpora predominantly +assembled from web data. The largest model, Falcon-180B, has been trained on +over 3.5 trillion tokens of text--the largest openly documented pretraining +run. Falcon-180B significantly outperforms models such as PaLM or Chinchilla, +and improves upon concurrently developed models such as LLaMA 2 or +Inflection-1. It nears the performance of PaLM-2-Large at a reduced pretraining +and inference cost, making it, to our knowledge, one of the three best language +models in the world along with GPT-4 and PaLM-2-Large. We report detailed +evaluations, as well as a deep dive into the methods and custom tooling +employed to pretrain Falcon. Notably, we report on our custom distributed +training codebase, allowing us to efficiently pretrain these models on up to +4,096 A100s on cloud AWS infrastructure with limited interconnect. We release a +600B-token extract of our web dataset, as well as the Falcon-7/40/180B models +under a permissive license to foster open science and accelerate the +development of an open ecosystem of large language models. + +
+
+
+
+
+ + ☆ A Benchmark for Evaluating Machine Translation Metrics on Dialects + Without Standard Orthography + + +
+ For sensible progress in natural language processing, it is important that we +are aware of the limitations of the evaluation metrics we use. In this work, we +evaluate how robust metrics are to non-standardized dialects, i.e. spelling +differences in language varieties that do not have a standard orthography. To +investigate this, we collect a dataset of human translations and human +judgments for automatic machine translations from English to two Swiss German +dialects. We further create a challenge set for dialect variation and benchmark +existing metrics' performances. Our results show that existing metrics cannot +reliably evaluate Swiss German text generation outputs, especially on segment +level. We propose initial design adaptations that increase robustness in the +face of non-standardized dialects, although there remains much room for further +improvement. The dataset, code, and models are available here: +https://github.com/textshuttle/dialect_eval + +
+
+ comment: WMT 2023 Research Paper +
+
+
+
+
+ + ☆ RELIC: Investigating Large Language Model Responses using + Self-Consistency + + +
+ Large Language Models (LLMs) are notorious for blending fact with fiction and +generating non-factual content, known as hallucinations. To tackle this +challenge, we propose an interactive system that helps users obtain insights +into the reliability of the generated text. Our approach is based on the idea +that the self-consistency of multiple samples generated by the same LLM relates +to its confidence in individual claims in the generated texts. Using this idea, +we design RELIC, an interactive system that enables users to investigate and +verify semantic-level variations in multiple long-form responses. This allows +users to recognize potentially inaccurate information in the generated text and +make necessary corrections. From a user study with ten participants, we +demonstrate that our approach helps users better verify the reliability of the +generated text. We further summarize the design implications and lessons +learned from this research for inspiring future studies on reliable human-LLM +interactions. + +
+
+
+
+
+ + ☆ The Claire French Dialogue Dataset + + +
+ We present the Claire French Dialogue Dataset (CFDD), a resource created by +members of LINAGORA Labs in the context of the OpenLLM France initiative. CFDD +is a corpus containing roughly 160 million words from transcripts and stage +plays in French that we have assembled and publicly released in an effort to +further the development of multilingual, open source language models. This +paper describes the 24 individual corpora of which CFDD is composed and +provides links and citations to their original sources. It also provides our +proposed breakdown of the full CFDD dataset into eight categories of subcorpora +and describes the process we followed to standardize the format of the final +dataset. We conclude with a discussion of similar work and future directions. + +
+
+
+
+
+ + ☆ Beyond Hallucinations: Enhancing LVLMs through Hallucination-Aware + Direct Preference Optimization + + +
+ Multimodal large language models have made significant advancements in recent years, yet they still suffer from a common issue known as the "hallucination problem", where the models generate textual descriptions that contain content that is inaccurate or absent from the image. To address this issue, this paper introduces a novel strategy: Hallucination-Aware Direct Preference Optimization (HA-DPO). Our approach treats the hallucination problem as a unique preference selection issue, where the model is trained to favor the non-hallucinating response when presented with two responses for the same image (one accurate and one hallucinating). This paper also presents an efficient process for constructing hallucination sample pairs to ensure high-quality, style-consistent pairs for stable HA-DPO training. We applied this strategy to two mainstream multimodal models, and the results showed a significant reduction in the hallucination problem and an enhancement in the models' generalization capabilities. With HA-DPO, the MiniGPT-4 model demonstrates significant advancements: POPE accuracy increases from 51.13% to 85.66% (34.5% absolute improvement), and the MME score rises from 968.58 to 1365.76 (41% relative improvement). The code, models, and datasets will be made publicly available.
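+
+ For illustration, a hedged sketch of the direct preference optimization objective that such a strategy builds on (not the HA-DPO implementation). The logp_* tensors stand for summed token log-probabilities of a whole response under the trained policy and a frozen reference model; how they are obtained from a specific multimodal model is outside this sketch.
+
+ import torch
+ import torch.nn.functional as F
+
+ def dpo_loss(logp_policy_good, logp_policy_bad, logp_ref_good, logp_ref_bad, beta=0.1):
+     """good = non-hallucinating response, bad = hallucinating response (same image)."""
+     margin = (logp_policy_good - logp_ref_good) - (logp_policy_bad - logp_ref_bad)
+     return -F.logsigmoid(beta * margin).mean()
+
+ # Toy per-example response log-probabilities, just to exercise the function.
+ loss = dpo_loss(torch.tensor([-12.3]), torch.tensor([-11.9]),
+                 torch.tensor([-12.5]), torch.tensor([-12.0]))
+ print(loss.item())
+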
+
+ comment: Preprint +
+
+
+
+
+ + ☆ CharacterGLM: Customizing Chinese Conversational AI Characters with + Large Language Models + + +
+ In this paper, we present CharacterGLM, a series of models built upon ChatGLM, with model sizes ranging from 6B to 66B parameters. Our CharacterGLM is designed for generating Character-based Dialogues (CharacterDial), which aims to equip a conversational AI system with character customization for satisfying people's inherent social desires and emotional needs. On top of CharacterGLM, we can customize various AI characters or social agents by configuring their attributes (identities, interests, viewpoints, experiences, achievements, social relationships, etc.) and behaviors (linguistic features, emotional expressions, interaction patterns, etc.). Our model outperforms most mainstream closed-source large language models, including the GPT series, especially in terms of consistency, human-likeness, and engagement according to manual evaluations. We will release our 6B version of CharacterGLM and a subset of training data to facilitate further research on character-based dialogue generation.
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Large Language Models Suffer From Their Own Output: An Analysis of the + Self-Consuming Training Loop + + +
+ Large language models (LLMs) have become the state of the art on many benchmarks, and conversational LLM applications like ChatGPT are now widely used by the public. These LLMs can be used to generate large amounts of content that is posted to various platforms on the internet. As LLMs are trained on datasets usually collected from the internet, this LLM-generated content might be used to train the next generation of LLMs. Therefore, a self-consuming training loop emerges in which new LLM generations are trained on the output of the previous generations. We empirically study this self-consuming training loop using a novel dataset that allows us to analytically and accurately measure the quality and diversity of generated outputs. We find that this self-consuming training loop initially improves both quality and diversity. However, after a few generations the output inevitably degenerates in diversity. We find that the rate of degeneration depends on the proportion of real and generated data.
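+
+ A toy, fully runnable sketch of the loop described above (not the paper's experimental setup): the "model" simply memorizes its training texts and "generation" resamples them with replacement, which is enough to watch a crude diversity proxy (the distinct-bigram ratio) shrink as synthetic data from earlier generations replaces real data.
+
+ import random
+
+ def distinct_bigrams(texts):
+     grams = [tuple(t.split()[i:i + 2]) for t in texts for i in range(len(t.split()) - 1)]
+     return len(set(grams)) / max(len(grams), 1)
+
+ real_data = [f"document {i} talks about topic {i % 7} in some detail" for i in range(100)]
+ data, real_fraction = list(real_data), 0.25
+ random.seed(0)
+ for gen in range(6):
+     model_memory = data                                          # stand-in for training an LLM
+     synthetic = random.choices(model_memory, k=len(real_data))   # stand-in for sampling from it
+     print(f"generation {gen}: diversity = {distinct_bigrams(synthetic):.3f}")
+     keep_real = int(real_fraction * len(real_data))
+     data = real_data[:keep_real] + synthetic[: len(real_data) - keep_real]
+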
+
+
+
+
+ + ☆ A Survey of the Evolution of Language Model-Based Dialogue Systems + + +
+ Dialogue systems, including task-oriented dialogue systems (TOD) and open-domain dialogue systems (ODD), have undergone significant transformations, with language models (LMs) playing a central role. This survey delves into the historical trajectory of dialogue systems, elucidating their intricate relationship with advancements in language models by categorizing this evolution into four distinct stages, each marked by pivotal LM breakthroughs: 1) the early stage, characterized by statistical LMs, resulting in rule-based or machine-learning-driven dialogue systems; 2) the independent development of TOD and ODD based on neural language models (NLMs; e.g., LSTM and GRU), since NLMs lack intrinsic knowledge in their parameters; 3) the fusion between different types of dialogue systems with the advent of pre-trained language models (PLMs), starting from the fusion between the four sub-tasks within TOD, and then between TOD and ODD; and 4) the current LLM-based dialogue systems, wherein LLMs can be used to conduct TOD and ODD seamlessly. Thus, our survey provides a chronological perspective aligned with LM breakthroughs, offering a comprehensive review of state-of-the-art research outcomes. Moreover, we focus on emerging topics and discuss open challenges, providing valuable insights into future directions for LLM-based dialogue systems. Through this exploration, we pave the way for a deeper comprehension of the evolution, guiding future developments in LM-based dialogue systems.
+
+
+
+
+ + ☆ Evaluating Optimal Reference Translations + + +
+ The overall translation quality reached by current machine translation (MT) systems for high-resource language pairs is remarkably good. Standard methods of evaluation are neither suitable nor intended to uncover the many translation errors and quality deficiencies that still persist. Furthermore, the quality of standard reference translations is commonly questioned, and comparable quality levels have been reached by MT alone in several language pairs. Navigating further research in these high-resource settings is thus difficult. In this article, we propose a methodology for creating more reliable document-level human reference translations, called "optimal reference translations," with the simple aim of raising the bar of what should be deemed "human translation quality." We evaluate the obtained document-level optimal reference translations in comparison with "standard" ones, confirming a significant quality increase and also documenting the relationship between evaluation and translation editing.
+
+ comment: To appear in Natural Language Engineering 2024 +
+
+
+
+
+ + ☆ Radiology-Aware Model-Based Evaluation Metric for Report Generation + + +
+ We propose a new automated evaluation metric for machine-generated radiology +reports using the successful COMET architecture adapted for the radiology +domain. We train and publish four medically-oriented model checkpoints, +including one trained on RadGraph, a radiology knowledge graph. Our results +show that our metric correlates moderately to high with established metrics +such as BERTscore, BLEU, and CheXbert scores. Furthermore, we demonstrate that +one of our checkpoints exhibits a high correlation with human judgment, as +assessed using the publicly available annotations of six board-certified +radiologists, using a set of 200 reports. We also performed our own analysis +gathering annotations with two radiologists on a collection of 100 reports. The +results indicate the potential effectiveness of our method as a +radiology-specific evaluation metric. The code, data, and model checkpoints to +reproduce our findings will be publicly available. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ LLMs for Science: Usage for Code Generation and Data Analysis + + +
+ Large language models (LLMs) have been touted to enable increased +productivity in many areas of today's work life. Scientific research as an area +of work is no exception: the potential of LLM-based tools to assist in the +daily work of scientists has become a highly discussed topic across +disciplines. However, we are only at the very onset of this subject of study. +It is still unclear how the potential of LLMs will materialise in research +practice. With this study, we give first empirical evidence on the use of LLMs +in the research process. We have investigated a set of use cases for LLM-based +tools in scientific research, and conducted a first study to assess to which +degree current tools are helpful. In this paper we report specifically on use +cases related to software engineering, such as generating application code and +developing scripts for data analytics. While we studied seemingly simple use +cases, results across tools differ significantly. Our results highlight the +promise of LLM-based tools in general, yet we also observe various issues, +particularly regarding the integrity of the output these tools provide. + +
+
+ comment: Preprint; In Submission +
+
+
+
+
+ + ☆ Entity-Aspect-Opinion-Sentiment Quadruple Extraction for Fine-grained + Sentiment Analysis + + +
+ Product reviews often contain a large number of implicit aspects and +object-attribute co-existence cases. Unfortunately, many existing studies in +Aspect-Based Sentiment Analysis (ABSA) have overlooked this issue, which can +make it difficult to extract opinions comprehensively and fairly. In this +paper, we propose a new task called Entity-Aspect-Opinion-Sentiment Quadruple +Extraction (EASQE), which aims to hierarchically decompose aspect terms into +entities and aspects to avoid information loss, non-exclusive annotations, and +opinion misunderstandings in ABSA tasks. To facilitate research in this new +task, we have constructed four datasets (Res14-EASQE, Res15-EASQE, Res16-EASQE, +and Lap14-EASQE) based on the SemEval Restaurant and Laptop datasets. We have +also proposed a novel two-stage sequence-tagging based Trigger-Opinion +framework as the baseline for the EASQE task. Empirical evaluations show that +our Trigger-Opinion framework can generate satisfactory EASQE results and can +also be applied to other ABSA tasks, significantly outperforming +state-of-the-art methods. We have made the four datasets and source code of +Trigger-Opinion publicly available to facilitate further research in this area. + +
+
+
+
+
+ + ☆ A Distribution-Based Threshold for Determining Sentence Similarity + + +
+ We present a solution to a semantic textual similarity (STS) problem in which it is necessary to match two sentences containing, as the only distinguishing factor, highly specific information (such as names, addresses, identification codes), and from which we need to derive a definition for when they are similar and when they are not. The solution revolves around the use of a Siamese neural network to create the distributions of the distances between similar and dissimilar pairs of sentences. The goal of these distributions is to find a discriminating factor, which we call the "threshold" and which represents a well-defined quantity that can be used to distinguish vector distances of similar pairs from vector distances of dissimilar pairs in new predictions and later analyses. In addition, we developed a way to score the predictions by combining attributes from both the distributions' features and the way the distance function works. Finally, we generalize the results, showing that they can be transferred to a wider range of domains by applying the system discussed to a well-known and widely used benchmark dataset for STS problems.
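+
+ A minimal sketch of the thresholding step (illustrative only; the Siamese model and the scoring scheme are not reproduced): distances of similar and dissimilar pairs are treated as two empirical distributions, and the threshold is taken as the cut-point that misclassifies the fewest pairs.
+
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ sim_dist = rng.normal(0.4, 0.15, 2000)   # stand-in for distances between similar pairs
+ dis_dist = rng.normal(1.1, 0.25, 2000)   # stand-in for distances between dissimilar pairs
+
+ candidates = np.linspace(min(sim_dist.min(), dis_dist.min()),
+                          max(sim_dist.max(), dis_dist.max()), 500)
+ errors = [(sim_dist > t).mean() + (dis_dist <= t).mean() for t in candidates]
+ threshold = candidates[int(np.argmin(errors))]
+ print(f"threshold = {threshold:.3f}")     # distances <= threshold are called "similar"
+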
+
+
+
+
+ + ☆ Text2Tree: Aligning Text Representation to the Label Tree Hierarchy for + Imbalanced Medical Classification EMNLP 2023 + + +
+ Deep learning approaches exhibit promising performance on various text tasks. However, they still struggle with medical text classification since samples are often extremely imbalanced and scarce. Different from existing mainstream approaches that focus on supplementary semantics with external medical information, this paper aims to rethink the data challenges in medical texts and presents a novel framework-agnostic algorithm called Text2Tree that utilizes only the internal label hierarchy in training deep learning models. We embed the ICD code tree structure of labels into cascade attention modules for learning hierarchy-aware label representations. Two new learning schemes, Similarity Surrogate Learning (SSL) and Dissimilarity Mixup Learning (DML), are devised to boost text classification by reusing and distinguishing samples of other labels following the label representation hierarchy, respectively. Experiments on authoritative public datasets and real-world medical records show that our approach consistently achieves superior performance over classical and advanced imbalanced classification methods.
+
+ comment: EMNLP 2023 Findings. Code: https://github.com/jyansir/Text2Tree +
+
+
+
+
+ + ☆ Scaling Political Texts with ChatGPT + + +
+ We use GPT-4 to obtain position estimates of political texts in continuous +spaces. We develop and validate a new approach by positioning British party +manifestos on the economic, social, and immigration policy dimensions and +tweets by members of the US Congress on the left-right ideological spectrum. +For the party manifestos, the correlation between the positions produced by +GPT-4 and experts is 93% or higher, a performance similar to or better than +that obtained with crowdsourced position estimates. For individual tweets, the +positions obtained with GPT-4 achieve a correlation of 91% with crowdsourced +position estimates. For senators of the 117th US Congress, the positions +obtained with GPT-4 achieve a correlation of 97% with estimates based on roll +call votes and of 96% with those based on campaign funding. Correlations are +also substantial within party, indicating that position estimates produced with +GPT-4 capture within-party differences between senators. Overall, using GPT-4 +for ideological scaling is fast, cost-efficient, and reliable. This approach +provides a viable alternative to scaling by both expert raters and +crowdsourcing. + +
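+
+ A hedged sketch of the general recipe (not the authors' prompts or pipeline): an LLM is asked to place each text on a numeric left-right scale, and the resulting scores are validated against reference positions. The query_llm function is a hypothetical stand-in for a real chat-completion call and is stubbed here so the snippet runs.
+
+ import numpy as np
+
+ PROMPT = ("On a scale from 0 (most left-wing) to 10 (most right-wing), where does the "
+           "author of the following text stand? Answer with a number only.\n\n{text}")
+
+ def query_llm(prompt):
+     # Placeholder for an actual LLM API call; returns canned scores so the example runs.
+     return "8" if "taxes" in prompt else "2"
+
+ texts = ["We must cut taxes and deregulate business.", "Healthcare is a universal right."]
+ reference = np.array([8.0, 2.0])          # e.g., expert or roll-call-based estimates
+ estimates = np.array([float(query_llm(PROMPT.format(text=t))) for t in texts])
+ print(np.corrcoef(estimates, reference)[0, 1])   # validation correlation
+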
+
+
+
+
+ + ☆ On the Long Range Abilities of Transformers + + +
+ Despite their dominance in modern DL and, especially, NLP domains, +transformer architectures exhibit sub-optimal performance on long-range tasks +compared to recent layers that are specifically designed for this purpose. In +this work, drawing inspiration from key attributes of long-range layers, such +as state-space layers, linear RNN layers, and global convolution layers, we +demonstrate that minimal modifications to the transformer architecture can +significantly enhance performance on the Long Range Arena (LRA) benchmark, thus +narrowing the gap with these specialized layers. We identify that two key +principles for long-range tasks are (i) incorporating an inductive bias towards +smoothness, and (ii) locality. As we show, integrating these ideas into the +attention mechanism improves results with a negligible amount of additional +computation and without any additional trainable parameters. Our theory and +experiments also shed light on the reasons for the inferior performance of +transformers on long-range tasks and identify critical properties that are +essential for successfully capturing long-range dependencies. + +
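+
+ One deliberately simplified way the two principles could be instantiated (a generic sketch, not the paper's exact modification): a distance-dependent locality bias added to the attention logits, plus a smoothing of the attention weights along the key axis.
+
+ import torch
+ import torch.nn.functional as F
+
+ def biased_smooth_attention(q, k, v, alpha=0.05, kernel=3):
+     b, n, d = q.shape                                   # (batch, seq, dim)
+     scores = q @ k.transpose(-2, -1) / d ** 0.5
+     idx = torch.arange(n)
+     scores = scores - alpha * (idx[None, :, None] - idx[None, None, :]).abs()  # locality bias
+     attn = scores.softmax(dim=-1)
+     # Smoothness: average neighbouring attention weights, then renormalise.
+     attn = F.avg_pool1d(attn.reshape(b * n, 1, n), kernel, stride=1,
+                         padding=kernel // 2).reshape(b, n, n)
+     attn = attn / attn.sum(dim=-1, keepdim=True)
+     return attn @ v
+
+ out = biased_smooth_attention(torch.randn(2, 16, 32), torch.randn(2, 16, 32), torch.randn(2, 16, 32))
+ print(out.shape)    # torch.Size([2, 16, 32])
+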
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ MedGen: A Python Natural Language Processing Toolkit for Medical Text + Processing + + +
+ This study introduces MedGen, a comprehensive natural language processing +(NLP) toolkit designed for medical text processing. MedGen is tailored for +biomedical researchers and healthcare professionals with an easy-to-use, +all-in-one solution that requires minimal programming expertise. It includes +(1) Generative Functions: For the first time, MedGen includes four advanced +generative functions: question answering, text summarization, text +simplification, and machine translation; (2) Basic NLP Functions: MedGen +integrates 12 essential NLP functions such as word tokenization and sentence +segmentation; and (3) Query and Search Capabilities: MedGen provides +user-friendly query and search functions on text corpora. We fine-tuned 32 +domain-specific language models, evaluated them thoroughly on 24 established +benchmarks and conducted manual reviews with clinicians. Additionally, we +expanded our toolkit by introducing query and search functions, while also +standardizing and integrating functions from third-party libraries. The +toolkit, its models, and associated data are publicly available via +https://github.com/Yale-LILY/MedGen. + +
+
+ comment: 5 figures, 4 tables +
+
+
+
+
+ + ☆ Recognizing Conditional Causal Relationships about Emotions and Their + Corresponding Conditions + + +
+ The study of causal relationships between emotions and causes in texts has recently received much attention. Most works focus on extracting causally related clauses from documents. However, none of these works has considered that the causal relationships between the extracted emotion and cause clauses may only be valid under specific context clauses. To highlight the context in such special causal relationships, we propose a new task to determine whether or not an input pair of emotion and cause has a valid causal relationship under different contexts, and to extract the specific context clauses that participate in the causal relationship. Since the task is new and no existing dataset is available, we conduct manual annotation on a benchmark dataset to obtain labels for our task, as well as annotations of each context clause's type that can also be used in other applications. We adopt negative sampling to construct the final dataset to balance the number of documents with and without causal relationships. Based on the constructed dataset, we propose an end-to-end multi-task framework, where we design two novel and general modules to handle the two goals of our task. Specifically, we propose a context masking module to extract the context clauses participating in the causal relationships, and a prediction aggregation module to fine-tune the prediction results according to whether the input emotion and cause depend on specific context clauses. Results of extensive comparative experiments and ablation studies demonstrate the effectiveness and generality of our proposed framework.
+
+
+
+
+ + ☆ Evaluation of dynamic characteristics of power grid based on GNN and + application on knowledge graph + + +
+ A novel method for detecting faults in power grids using a graph neural +network (GNN) has been developed, aimed at enhancing intelligent fault +diagnosis in network operation and maintenance. This GNN-based approach +identifies faulty nodes within the power grid through a specialized electrical +feature extraction model coupled with a knowledge graph. Incorporating temporal +data, the method leverages the status of nodes from preceding and subsequent +time periods to aid in current fault detection. To validate the effectiveness +of this GNN in extracting node features, a correlation analysis of the output +features from each node within the neural network layer was conducted. The +results from experiments show that this method can accurately locate fault +nodes in simulated scenarios with a remarkable 99.53% accuracy. Additionally, +the graph neural network's feature modeling allows for a qualitative +examination of how faults spread across nodes, providing valuable insights for +analyzing fault nodes. + +
+
+
+
+
+ + ☆ StyleCap: Automatic Speaking-Style Captioning from Speech Based on + Speech and Language Self-supervised Learning Models ICASSP 2024 + + +
+ We propose StyleCap, a method to generate natural language descriptions of speaking styles appearing in speech. Although most conventional techniques for para-/non-linguistic information recognition focus on category classification or intensity estimation of pre-defined labels, they cannot explain the reasoning behind the recognition result in an interpretable manner. As a first step towards an end-to-end method for generating speaking-style prompts from speech, i.e., automatic speaking-style captioning, StyleCap uses paired data of speech and natural language descriptions to train neural networks that predict, from a speech representation vector, prefix vectors fed into a large language model (LLM)-based text decoder. We explore text decoders and speech feature representations suitable for this new task. The experimental results demonstrate that our StyleCap, leveraging richer LLMs for the text decoder, speech self-supervised learning (SSL) features, and sentence rephrasing augmentation, improves the accuracy and diversity of generated speaking-style captions. Samples of speaking-style captions generated by our StyleCap are publicly available.
+
+ comment: Submitted to ICASSP 2024 +
+
+
+
+
+ + ☆ Enhancing Human Persuasion With Large Language Models + + +
+ Although large language models (LLMs) are reshaping various aspects of human +life, our current understanding of their impacts remains somewhat constrained. +Here we investigate the impact of LLMs on human communication, in the context +of consumer complaints in the financial industry. Employing an AI detection +tool on more than 780K complaints gathered by the Consumer Financial Protection +Bureau (CFPB), we find evidence of LLM usage in the writing of complaints - +shortly after the release of ChatGPT. Our analyses reveal that LLM usage is +positively correlated with the likelihood of obtaining desirable outcomes +(i.e., offer of relief from financial firms) and suggest that this positive +correlation may be partly due to the linguistic features improved by LLMs. We +test this conjecture with a preregistered experiment, which reveals results +consistent with those from observational studies: Consumer complaints written +with ChatGPT for improved linguistic qualities were more likely to receive +hypothetical relief offers than the original consumer complaints, demonstrating +the LLM's ability to enhance message persuasiveness in human communication. +Being some of the earliest empirical evidence on LLM usage for enhancing +persuasion, our results highlight the transformative potential of LLMs in human +communication. + +
+
+
+
+
+ + ☆ Can Generalist Foundation Models Outcompete Special-Purpose Tuning? Case + Study in Medicine + + +
+ Generalist foundation models such as GPT-4 have displayed surprising +capabilities in a wide variety of domains and tasks. Yet, there is a prevalent +assumption that they cannot match specialist capabilities of fine-tuned models. +For example, most explorations to date on medical competency benchmarks have +leveraged domain-specific training, as exemplified by efforts on BioGPT and +Med-PaLM. We build on a prior study of GPT-4's capabilities on medical +challenge benchmarks in the absence of special training. Rather than using +simple prompting to highlight the model's out-of-the-box capabilities, we +perform a systematic exploration of prompt engineering. We find that prompting +innovation can unlock deeper specialist capabilities and show that GPT-4 easily +tops prior leading results for medical benchmarks. The prompting methods we +explore are general purpose, and make no specific use of domain expertise, +removing the need for expert-curated content. Our experimental design carefully +controls for overfitting during the prompt engineering process. We introduce +Medprompt, based on a composition of several prompting strategies. With +Medprompt, GPT-4 achieves state-of-the-art results on all nine of the benchmark +datasets in the MultiMedQA suite. The method outperforms leading specialist +models such as Med-PaLM 2 by a significant margin with an order of magnitude +fewer calls to the model. Steering GPT-4 with Medprompt achieves a 27% +reduction in error rate on the MedQA dataset over the best methods to date +achieved with specialist models and surpasses a score of 90% for the first +time. Beyond medical problems, we show the power of Medprompt to generalize to +other domains and provide evidence for the broad applicability of the approach +via studies of the strategy on exams in electrical engineering, machine +learning, philosophy, accounting, law, nursing, and clinical psychology. + +
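+
+ A hedged sketch of one ingredient of such a prompting composition, choice shuffling with majority voting (kNN few-shot selection and chain-of-thought are omitted; this is not the Medprompt implementation). The ask_model function is a hypothetical stand-in for an LLM call and is stubbed so the example runs.
+
+ import random
+ from collections import Counter
+
+ def ask_model(question, options):
+     # Placeholder for a real LLM call returning the letter of the chosen option;
+     # this stub always "prefers" the aorta answer.
+     return "ABCD"[options.index("aorta")]
+
+ def shuffled_vote(question, options, n_votes=5, seed=0):
+     rng, votes = random.Random(seed), Counter()
+     for _ in range(n_votes):
+         order = options[:]
+         rng.shuffle(order)                        # present the options in a new order each time
+         letter = ask_model(question, order)
+         votes[order["ABCD".index(letter)]] += 1   # map the letter back to the option text
+     return votes.most_common(1)[0][0]
+
+ print(shuffled_vote("Largest artery in the human body?", ["aorta", "femoral", "carotid", "radial"]))
+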
+
+ comment: 21 pages, 7 figures +
+
+
+
+
+ + ☆ Exo2EgoDVC: Dense Video Captioning of Egocentric Procedural Activities + Using Web Instructional Videos + + +
+ We propose a novel benchmark for cross-view knowledge transfer of dense video +captioning, adapting models from web instructional videos with exocentric views +to an egocentric view. While dense video captioning (predicting time segments +and their captions) is primarily studied with exocentric videos (e.g., +YouCook2), benchmarks with egocentric videos are restricted due to data +scarcity. To overcome the limited video availability, transferring knowledge +from abundant exocentric web videos is demanded as a practical approach. +However, learning the correspondence between exocentric and egocentric views is +difficult due to their dynamic view changes. The web videos contain mixed views +focusing on either human body actions or close-up hand-object interactions, +while the egocentric view is constantly shifting as the camera wearer moves. +This necessitates the in-depth study of cross-view transfer under complex view +changes. In this work, we first create a real-life egocentric dataset (EgoYC2) +whose captions are shared with YouCook2, enabling transfer learning between +these datasets assuming their ground-truth is accessible. To bridge the view +gaps, we propose a view-invariant learning method using adversarial training in +both the pre-training and fine-tuning stages. While the pre-training is +designed to learn invariant features against the mixed views in the web videos, +the view-invariant fine-tuning further mitigates the view gaps between both +datasets. We validate our proposed method by studying how effectively it +overcomes the view change problem and efficiently transfers the knowledge to +the egocentric domain. Our benchmark pushes the study of the cross-view +transfer into a new task domain of dense video captioning and will envision +methodologies to describe egocentric videos in natural language. + +
+
+
+
+
+ + ☆ CDEval: A Benchmark for Measuring the Cultural Dimensions of Large + Language Models + + +
+ As the scaling of Large Language Models (LLMs) has dramatically enhanced +their capabilities, there has been a growing focus on the alignment problem to +ensure their responsible and ethical use. While existing alignment efforts +predominantly concentrate on universal values such as the HHH principle, the +aspect of culture, which is inherently pluralistic and diverse, has not +received adequate attention. This work introduces a new benchmark, CDEval, +aimed at evaluating the cultural dimensions of LLMs. CDEval is constructed by +incorporating both GPT-4's automated generation and human verification, +covering six cultural dimensions across seven domains. Our comprehensive +experiments provide intriguing insights into the culture of mainstream LLMs, +highlighting both consistencies and variations across different dimensions and +domains. The findings underscore the importance of integrating cultural +considerations in LLM development, particularly for applications in diverse +cultural settings. Through CDEval, we aim to broaden the horizon of LLM +alignment research by including cultural dimensions, thus providing a more +holistic framework for the future development and evaluation of LLMs. This +benchmark serves as a valuable resource for cultural studies in LLMs, paving +the way for more culturally aware and sensitive models. + +
+
+ comment: Work in progress
+
+
+
+
+ + ☆ Does VLN Pretraining Work with Nonsensical or Irrelevant Instructions? + + +
+ Data augmentation via back-translation is common when pretraining Vision-and-Language Navigation (VLN) models, even though the generated instructions are noisy. But does that noise matter? We find that nonsensical or irrelevant language instructions during pretraining have little effect on downstream performance for both HAMT and VLN-BERT on R2R, and pretraining with them is still better than using clean, human data alone. To underscore these results, we concoct an efficient augmentation method, Unigram + Object, which generates nonsensical instructions that nonetheless improve downstream performance. Our findings suggest that what matters for VLN R2R pretraining is the quantity of visual trajectories, not the quality of instructions.
+
+ comment: 17 pages, 8 figures +
+
+
+
+
+ + ☆ RETSim: Resilient and Efficient Text Similarity + + +
+ This paper introduces RETSim (Resilient and Efficient Text Similarity), a +lightweight, multilingual deep learning model trained to produce robust metric +embeddings for near-duplicate text retrieval, clustering, and dataset +deduplication tasks. We demonstrate that RETSim is significantly more robust +and accurate than MinHash and neural text embeddings, achieving new +state-of-the-art performance on dataset deduplication, adversarial text +retrieval benchmarks, and spam clustering tasks. We also introduce the W4NT3D +benchmark (Wiki-40B 4dversarial Near-T3xt Dataset) for evaluating multilingual, +near-duplicate text retrieval capabilities under adversarial settings. RETSim +and the W4NT3D benchmark are open-sourced under the MIT License at +https://github.com/google/unisim. + +
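+
+ A sketch of the near-duplicate detection workflow such a model targets; character n-gram TF-IDF stands in for RETSim's learned metric embeddings (the released package's API is not shown here), and the threshold is an arbitrary toy value that would normally be tuned on labelled duplicate pairs.
+
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ docs = ["Buy cheap watches now!!!", "Buy ch3ap watches n0w!!", "Quarterly report, FY2023"]
+ emb = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5)).fit_transform(docs)
+ sims = cosine_similarity(emb)
+
+ threshold = 0.4
+ for i in range(len(docs)):
+     for j in range(i + 1, len(docs)):
+         if sims[i, j] >= threshold:
+             print(f"near-duplicates ({sims[i, j]:.2f}): {docs[i]!r} ~ {docs[j]!r}")
+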
+
+
+
+
+ + ☆ Quantifying the redundancy between prosody and text EMNLP + + +
+ Prosody -- the suprasegmental component of speech, including pitch, loudness, +and tempo -- carries critical aspects of meaning. However, the relationship +between the information conveyed by prosody vs. by the words themselves remains +poorly understood. We use large language models (LLMs) to estimate how much +information is redundant between prosody and the words themselves. Using a +large spoken corpus of English audiobooks, we extract prosodic features aligned +to individual words and test how well they can be predicted from LLM +embeddings, compared to non-contextual word embeddings. We find a high degree +of redundancy between the information carried by the words and prosodic +information across several prosodic features, including intensity, duration, +pauses, and pitch contours. Furthermore, a word's prosodic information is +redundant with both the word itself and the context preceding as well as +following it. Still, we observe that prosodic features can not be fully +predicted from text, suggesting that prosody carries information above and +beyond the words. Along with this paper, we release a general-purpose data +processing pipeline for quantifying the relationship between linguistic +information and extra-linguistic features. + +
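+
+ A minimal sketch of the redundancy estimate described above (illustrative, not the released pipeline): a regressor predicts a prosodic feature from word embeddings, and the held-out R^2 is read as the share of prosodic variance that is redundant with the text. Random arrays stand in for real embeddings and features.
+
+ import numpy as np
+ from sklearn.linear_model import Ridge
+ from sklearn.model_selection import cross_val_score
+
+ rng = np.random.default_rng(0)
+ embeddings = rng.normal(size=(5000, 64))        # stand-in for (contextual) word embeddings
+ duration = embeddings[:, :8].sum(axis=1) + rng.normal(scale=1.0, size=5000)   # synthetic prosodic target
+
+ r2 = cross_val_score(Ridge(alpha=1.0), embeddings, duration, cv=5, scoring="r2").mean()
+ print(f"held-out R^2 (proxy for text-prosody redundancy): {r2:.2f}")
+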
+
+ comment: Published at The 2023 Conference on Empirical Methods in Natural + Language Processing (EMNLP) +
+
+
+
+
+ + ☆ War and Peace (WarAgent): Large Language Model-based Multi-Agent + Simulation of World Wars + + +
+ Can we avoid wars at the crossroads of history? This question has been +pursued by individuals, scholars, policymakers, and organizations throughout +human history. In this research, we attempt to answer the question based on the +recent advances of Artificial Intelligence (AI) and Large Language Models +(LLMs). We propose \textbf{WarAgent}, an LLM-powered multi-agent AI system, to +simulate the participating countries, their decisions, and the consequences, in +historical international conflicts, including the World War I (WWI), the World +War II (WWII), and the Warring States Period (WSP) in Ancient China. By +evaluating the simulation effectiveness, we examine the advancements and +limitations of cutting-edge AI systems' abilities in studying complex +collective human behaviors such as international conflicts under diverse +settings. In these simulations, the emergent interactions among agents also +offer a novel perspective for examining the triggers and conditions that lead +to war. Our findings offer data-driven and AI-augmented insights that can +redefine how we approach conflict resolution and peacekeeping strategies. The +implications stretch beyond historical analysis, offering a blueprint for using +AI to understand human history and possibly prevent future international +conflicts. Code and data are available at +\url{https://github.com/agiresearch/WarAgent}. + +
+
+ comment: 40 pages, 7 figures +
+
+
+
+
+ + ☆ General-Purpose vs. Domain-Adapted Large Language Models for Extraction + of Data from Thoracic Radiology Reports + + +
+ Radiologists produce unstructured data that could be valuable for clinical care when consumed by information systems. However, variability in style limits usage. This study compares the performance of a system using a domain-adapted language model (RadLing) with one using a general-purpose large language model (GPT-4) in extracting common data elements (CDEs) from thoracic radiology reports. Three radiologists annotated a retrospective dataset of 1300 thoracic reports (900 training, 400 test) and mapped them to 21 pre-selected relevant CDEs. RadLing was used to generate embeddings for sentences and identify CDEs using cosine similarity, which were mapped to values using a lightweight mapper. The GPT-4 system used OpenAI's general-purpose embeddings to identify relevant CDEs and used GPT-4 to map them to values. The output CDE:value pairs were compared to the reference standard; an identical match was considered a true positive. Precision (positive predictive value) was 96% (2700/2824) for RadLing and 99% (2034/2047) for GPT-4. Recall (sensitivity) was 94% (2700/2876) for RadLing and 70% (2034/2887) for GPT-4; the difference was statistically significant (P<.001). RadLing's domain-adapted embeddings were more sensitive in CDE identification (95% vs 71%), and its lightweight mapper had comparable precision in value assignment (95.4% vs 95.0%). Overall, the domain-adapted RadLing system surpassed the GPT-4 system in extracting common data elements from radiology reports, while offering operational advantages including local deployment and reduced runtime costs.
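+
+ A simplified sketch of the embedding-and-cosine-similarity step such a system relies on (TF-IDF stands in for domain-adapted sentence embeddings, the CDE list and threshold are toy values, and the value-mapping stage is omitted).
+
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ cde_descriptions = {"pleural effusion": "presence of pleural effusion",
+                     "pneumothorax": "presence of pneumothorax",
+                     "cardiomegaly": "enlarged cardiac silhouette / cardiomegaly"}
+ sentences = ["There is a small left pleural effusion.",
+              "The cardiac silhouette is enlarged."]
+
+ vec = TfidfVectorizer().fit(list(cde_descriptions.values()) + sentences)
+ sims = cosine_similarity(vec.transform(sentences), vec.transform(list(cde_descriptions.values())))
+
+ for sentence, row in zip(sentences, sims):
+     best_cde, best_score = max(zip(cde_descriptions, row), key=lambda x: x[1])
+     if best_score > 0.2:    # toy threshold; in practice tuned on annotated reports
+         print(f"{best_cde!r:20} <- {sentence}")
+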
+
+
+
+
+ + ☆ Pragmatic Radiology Report Generation + + +
+ When pneumonia is not found on a chest X-ray, should the report describe this +negative observation or omit it? We argue that this question cannot be answered +from the X-ray alone and requires a pragmatic perspective, which captures the +communicative goal that radiology reports serve between radiologists and +patients. However, the standard image-to-text formulation for radiology report +generation fails to incorporate such pragmatic intents. Following this +pragmatic perspective, we demonstrate that the indication, which describes why +a patient comes for an X-ray, drives the mentions of negative observations and +introduce indications as additional input to report generation. With respect to +the output, we develop a framework to identify uninferable information from the +image as a source of model hallucinations, and limit them by cleaning +groundtruth reports. Finally, we use indications and cleaned groundtruth +reports to develop pragmatic models, and show that they outperform existing +methods not only in new pragmatics-inspired metrics (+4.3 Negative F1) but also +in standard metrics (+6.3 Positive F1 and +11.0 BLEU-2). + +
+
+ comment: 18 pages, 1 figure, 18 tables. Code at + https://github.com/ChicagoHAI/llm_radiology +
+
+
+
+
+ + ☆ UniIR: Training and Benchmarking Universal Multimodal Information + Retrievers + + +
+ Existing information retrieval (IR) models often assume a homogeneous format, +limiting their applicability to diverse user needs, such as searching for +images with text descriptions, searching for a news article with a headline +image, or finding a similar photo with a query image. To approach such +different information-seeking demands, we introduce UniIR, a unified +instruction-guided multimodal retriever capable of handling eight distinct +retrieval tasks across modalities. UniIR, a single retrieval system jointly +trained on ten diverse multimodal-IR datasets, interprets user instructions to +execute various retrieval tasks, demonstrating robust performance across +existing datasets and zero-shot generalization to new tasks. Our experiments +highlight that multi-task training and instruction tuning are keys to UniIR's +generalization ability. Additionally, we construct the M-BEIR, a multimodal +retrieval benchmark with comprehensive results, to standardize the evaluation +of universal multimodal information retrieval. + +
+
+ comment: Our code and dataset are available on this project page: + https://tiger-ai-lab.github.io/UniIR/ +
+
+
+
+
+ + ☆ Reason out Your Layout: Evoking the Layout Master from Large Language + Models for Text-to-Image Synthesis + + +
+ Recent advancements in text-to-image (T2I) generative models have shown remarkable capabilities in producing diverse and imaginative visuals based on text prompts. Despite this progress, these diffusion models sometimes struggle to fully translate the semantic content of the text into images. While conditioning on layout has been shown to improve the compositional ability of T2I diffusion models, such approaches typically require manual layout input. In this work, we introduce a novel approach to improving T2I diffusion models using Large Language Models (LLMs) as layout generators. Our method leverages the Chain-of-Thought prompting of LLMs to interpret text and generate spatially reasonable object layouts. The generated layout is then used to enhance the generated images' composition and spatial accuracy. Moreover, we propose an efficient adapter based on a cross-attention mechanism, which explicitly integrates the layout information into Stable Diffusion models. Our experiments demonstrate significant improvements in image quality and layout accuracy, showcasing the potential of LLMs in augmenting generative image models.
+
+ comment: preprint +
+
+
+
+
+ + ☆ ClimateX: Do LLMs Accurately Assess Human Expert Confidence in Climate + Statements? NeurIPS + 2023 + + +
+ Evaluating the accuracy of outputs generated by Large Language Models (LLMs) +is especially important in the climate science and policy domain. We introduce +the Expert Confidence in Climate Statements (ClimateX) dataset, a novel, +curated, expert-labeled dataset consisting of 8094 climate statements collected +from the latest Intergovernmental Panel on Climate Change (IPCC) reports, +labeled with their associated confidence levels. Using this dataset, we show +that recent LLMs can classify human expert confidence in climate-related +statements, especially in a few-shot learning setting, but with limited (up to +47%) accuracy. Overall, models exhibit consistent and significant +over-confidence on low and medium confidence statements. We highlight +implications of our results for climate communication, LLMs evaluation +strategies, and the use of LLMs in information retrieval systems. + +
+
+ comment: Tackling Climate Change with Machine Learning workshop at NeurIPS + 2023 +
+
+
+
+
+ + ☆ PEA-Diffusion: Parameter-Efficient Adapter with Knowledge Distillation + in non-English Text-to-Image Generation + + +
+ Text-to-image diffusion models are well-known for their ability to generate realistic images based on textual prompts. However, existing works have predominantly focused on English, leaving non-English text-to-image generation largely unsupported. The most commonly used translation methods cannot resolve generation problems tied to language-specific culture, while training from scratch on a specific language dataset is prohibitively expensive. In this paper, we propose a simple plug-and-play language transfer method based on knowledge distillation. All we need to do is train a lightweight MLP-like parameter-efficient adapter (PEA) with only 6M parameters under teacher knowledge distillation, along with a small parallel data corpus. Surprisingly, we find that freezing the parameters of the UNet can still achieve remarkable performance on the language-specific prompt evaluation set, demonstrating that the PEA can stimulate the potential generation ability of the original UNet. Additionally, it closely approaches the performance of the English text-to-image model on a general prompt evaluation set. Furthermore, our adapter can be used as a plugin to achieve significant results in downstream tasks in cross-lingual text-to-image generation. Code will be available at: https://github.com/OPPO-Mente-Lab/PEA-Diffusion
+
+ comment: 17 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ A Brief History of Prompt: Leveraging Language Models. (Through Advanced + Prompting) + + +
+ This paper presents a comprehensive exploration of the evolution of prompt +engineering and generation in the field of natural language processing (NLP). +Starting from the early language models and information retrieval systems, we +trace the key developments that have shaped prompt engineering over the years. +The introduction of attention mechanisms in 2015 revolutionized language +understanding, leading to advancements in controllability and +context-awareness. Subsequent breakthroughs in reinforcement learning +techniques further enhanced prompt engineering, addressing issues like exposure +bias and biases in generated text. We examine the significant contributions in +2018 and 2019, focusing on fine-tuning strategies, control codes, and +template-based generation. The paper also discusses the growing importance of +fairness, human-AI collaboration, and low-resource adaptation. In 2020 and +2021, contextual prompting and transfer learning gained prominence, while 2022 +and 2023 witnessed the emergence of advanced techniques like unsupervised +pre-training and novel reward shaping. Throughout the paper, we reference +specific research studies that exemplify the impact of various developments on +prompt engineering. The journey of prompt engineering continues, with ethical +considerations being paramount for the responsible and inclusive future of AI +systems. + +
+
+
+
+
+ + ♻ ☆ People Make Better Edits: Measuring the Efficacy of LLM-Generated + Counterfactually Augmented Data for Harmful Language Detection EMNLP'23 + + +
+ NLP models are used in a variety of critical social computing tasks, such as +detecting sexist, racist, or otherwise hateful content. Therefore, it is +imperative that these models are robust to spurious features. Past work has +attempted to tackle such spurious features using training data augmentation, +including Counterfactually Augmented Data (CADs). CADs introduce minimal +changes to existing training data points and flip their labels; training on +them may reduce model dependency on spurious features. However, manually +generating CADs can be time-consuming and expensive. Hence in this work, we +assess if this task can be automated using generative NLP models. We +automatically generate CADs using Polyjuice, ChatGPT, and Flan-T5, and evaluate +their usefulness in improving model robustness compared to manually-generated +CADs. By testing both model performance on multiple out-of-domain test sets and +individual data point efficacy, our results show that while manual CADs are +still the most effective, CADs generated by ChatGPT come a close second. One +key reason for the lower performance of automated methods is that the changes +they introduce are often insufficient to flip the original label. + +
+
+ comment: Preprint of EMNLP'23 paper +
+
+
+
+
+ + ♻ ☆ LoftQ: LoRA-Fine-Tuning-Aware Quantization for Large Language Models + + +
+ Quantization is an indispensable technique for serving Large Language Models +(LLMs) and has recently found its way into LoRA fine-tuning. In this work we +focus on the scenario where quantization and LoRA fine-tuning are applied +together on a pre-trained model. In such cases it is common to observe a +consistent gap in the performance on downstream tasks between full fine-tuning +and quantization plus LoRA fine-tuning approach. In response, we propose LoftQ +(LoRA-Fine-Tuning-aware Quantization), a novel quantization framework that +simultaneously quantizes an LLM and finds a proper low-rank initialization for +LoRA fine-tuning. Such an initialization alleviates the discrepancy between the +quantized and full-precision model and significantly improves generalization in +downstream tasks. We evaluate our method on natural language understanding, +question answering, summarization, and natural language generation tasks. +Experiments show that our method is highly effective and outperforms existing +quantization methods, especially in the challenging 2-bit and 2/4-bit mixed +precision regimes. The code is available on https://github.com/yxli2123/LoftQ. + +
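+
+ A sketch of the alternating initialization idea, paraphrased from the abstract rather than taken from the released code, with a crude uniform quantizer standing in for the low-bit formats used in practice: repeatedly quantize the residual and re-fit a rank-r correction so that Q + A @ B starts close to the full-precision weight before LoRA fine-tuning.
+
+ import numpy as np
+
+ def uniform_quantize(w, bits=2):
+     levels = 2 ** bits - 1
+     lo, hi = w.min(), w.max()
+     return np.round((w - lo) / (hi - lo) * levels) / levels * (hi - lo) + lo
+
+ def loftq_style_init(w, rank=8, bits=2, steps=5):
+     a, b = np.zeros((w.shape[0], rank)), np.zeros((rank, w.shape[1]))
+     for _ in range(steps):
+         q = uniform_quantize(w - a @ b, bits)               # quantize the uncorrected part
+         u, s, vt = np.linalg.svd(w - q, full_matrices=False)
+         a, b = u[:, :rank] * s[:rank], vt[:rank]            # rank-r fit of what quantization lost
+     return q, a, b
+
+ w = np.random.default_rng(0).normal(size=(128, 128))
+ q, a, b = loftq_style_init(w)
+ print("relative error:", np.linalg.norm(w - (q + a @ b)) / np.linalg.norm(w))
+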
+
+
+
+
+ + ♻ ☆ What If the TV Was Off? Examining Counterfactual Reasoning Abilities of + Multi-modal Language Models + + +
+ Counterfactual reasoning, a fundamental aspect of human cognition, involves +contemplating alternatives to established facts or past events, significantly +enhancing our abilities in planning and decision-making. In light of the +advancements in current multi-modal large language models, we explore their +effectiveness in counterfactual reasoning. To facilitate this investigation, we +introduce a novel dataset, C-VQA, specifically designed to test the +counterfactual reasoning capabilities of modern multi-modal large language +models. This dataset is constructed by infusing original questions with +counterfactual presuppositions, spanning various types such as numerical and +boolean queries. It encompasses a mix of real and synthetic data, representing +a wide range of difficulty levels. Our thorough evaluations of contemporary +vision-language models using this dataset have revealed substantial performance +drops, with some models showing up to a 40% decrease, highlighting a +significant gap between current models and human-like vision reasoning +capabilities. We hope our dataset will serve as a vital benchmark for +evaluating the counterfactual reasoning capabilities of models. Code and +dataset are publicly available at https://bzhao.me/C-VQA/. + +
+
+
+
+
+ + ♻ ☆ GraphPrompt: Graph-Based Prompt Templates for Biomedical Synonym + Prediction + + +
+ As biomedical datasets expand, the same category may be labeled with different terms, making it tedious and onerous to curate these terms. Therefore, automatically mapping synonymous terms onto the ontologies is desirable, a task we name biomedical synonym prediction. Unlike biomedical concept normalization (BCN), no clues from context can be used to enhance synonym prediction, making it essential to extract graph features from the ontology. We introduce an expert-curated dataset, OBO-syn, encompassing 70 different types of concepts and 2 million curated concept-term pairs for evaluating synonym prediction methods. We find that BCN methods perform weakly on this task because they do not make full use of graph information. Therefore, we propose GraphPrompt, a prompt-based learning approach that creates prompt templates according to the graphs. GraphPrompt obtained 37.2% and 28.5% improvement in zero-shot and few-shot settings respectively, indicating the effectiveness of these graph-based prompt templates. We envision that our method GraphPrompt and the OBO-syn dataset can be broadly applied to graph-based NLP tasks, and serve as the basis for analyzing diverse and accumulating biomedical data. All data and code are available at: https://github.com/HanwenXuTHU/GraphPrompt
+
+ comment: 7 pages +
+
+
+
+
+ + ♻ ☆ Extending CAM-based XAI methods for Remote Sensing Imagery Segmentation + + +
+ Current AI-based methods do not provide comprehensible physical interpretations of the utilized data, extracted features, and predictions/inference operations. As a result, deep learning models trained using high-resolution satellite imagery lack transparency and explainability and can merely be seen as a black box, which limits their widespread adoption. Experts need help understanding the complex behavior of AI models and the underlying decision-making process. Explainable artificial intelligence (XAI) is an emerging field providing means for robust, practical, and trustworthy deployment of AI models. Several XAI techniques have been proposed for image classification tasks, whereas the interpretation of image segmentation remains largely unexplored. This paper aims to bridge this gap by adapting recent XAI classification algorithms and making them usable for multi-class image segmentation, where we mainly focus on building segmentation from high-resolution satellite images. To benchmark and compare the performance of the proposed approaches, we introduce a new XAI evaluation methodology and metric based on "Entropy" to measure the model uncertainty. Conventional XAI evaluation methods rely mainly on feeding area-of-interest regions from the image back to the pre-trained (utility) model and then calculating the average change in the probability of the target class. Those evaluation metrics lack the needed robustness, and we show that using Entropy to monitor the model uncertainty in segmenting the pixels within the target class is more suitable. We hope this work will pave the way for additional XAI research for image segmentation and applications in the remote sensing discipline.
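+
+ A minimal sketch of an entropy-based uncertainty score of the kind proposed above (illustrative only): the mean per-pixel entropy of the softmax output, restricted to pixels the model assigns to the target class (e.g. "building").
+
+ import numpy as np
+
+ def target_class_entropy(probs, target_class):
+     """probs: (H, W, C) softmax map; mean entropy over pixels predicted as target_class."""
+     entropy = -(probs * np.log(probs + 1e-12)).sum(axis=-1)   # per-pixel entropy
+     mask = probs.argmax(axis=-1) == target_class
+     return float(entropy[mask].mean()) if mask.any() else float("nan")
+
+ rng = np.random.default_rng(0)
+ logits = rng.normal(size=(64, 64, 3))                          # stand-in for a segmentation model's output
+ probs = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)
+ print(target_class_entropy(probs, target_class=1))
+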
+
+
+
+
+ + ♻ ☆ Patent Documents to Engineering Design Knowledge Graphs + + +
+ Aimed at supporting knowledge-intensive tasks in the design process, +populating design knowledge from text documents involves the extraction of +triples - head entity :: relationship :: tail entity or h :: r :: t that could +be combined into a knowledge graph representation. As relationships are largely +chosen from ontological or common-sense alternatives, knowledge graphs built +using these depict an approximation or restricted view of design knowledge, +rather than what is explicated in text document. In this article, we present a +data-driven approach to identify and explicate facts (h :: r :: t) from +sentences in patent documents. We create a dataset of 44,227 sentences and +facts, encompassing all patent classifications while also capturing the +variations among patent document sections. Using this dataset, we train taggers +that classify tokens to: 1) identify all entities (h) and relationships (r) and +2) specific relationships (r) for a pair of entities (h :: ___ :: t). While +these taggers are built upon transformer-based sequence classification models, +we evaluate our proposed method against edge classification approaches that use +linear classifiers and graph neural networks, incorporating transformer-based +token embeddings and linguistic features. The simplicity and coverage of the +proposed method enable its application to patent documents at any scale and +variety. Upon deploying an open-source python package, we apply our method to +patent documents related to fan systems. From the knowledge graphs thus +extracted, we explain how facts could be generalised to domain ontologies as +well as be specified to subsystem levels. We also highlight the importance of +knowledge graph representations by retrieving and explicating the knowledge of +key issues in fan systems, while holding a comparative discussion against +opinions from ChatGPT. + +
+
+
+
+
+ + ♻ ☆ A Survey of Graph Meets Large Language Model: Progress and Future + Directions + + +
+ Graphs play a significant role in representing and analyzing complex relationships in real-world applications such as citation networks, social networks, and biological data. Recently, Large Language Models (LLMs), which have achieved tremendous success in various domains, have also been leveraged in graph-related tasks to surpass traditional Graph Neural Network (GNN)-based methods and yield state-of-the-art performance. In this survey, we present a comprehensive review and analysis of existing methods that integrate LLMs with graphs. First, we propose a new taxonomy, which organizes existing methods into three categories based on the role (i.e., enhancer, predictor, and alignment component) played by LLMs in graph-related tasks. Then we systematically survey the representative methods along the three categories of the taxonomy. Finally, we discuss the remaining limitations of existing studies and highlight promising avenues for future research. The relevant papers are summarized and will be continuously updated at: https://github.com/yhLeeee/Awesome-LLMs-in-Graph-tasks.
+
+ comment: Work in progress; 13 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Towards End-to-End Embodied Decision Making via Multi-modal Large + Language Model: Explorations with GPT4-Vision and Beyond MDM + + +
+ In this study, we explore the potential of Multimodal Large Language Models +(MLLMs) in improving embodied decision-making processes for agents. While Large +Language Models (LLMs) have been widely used due to their advanced reasoning +skills and vast world knowledge, MLLMs like GPT4-Vision offer enhanced visual +understanding and reasoning capabilities. We investigate whether +state-of-the-art MLLMs can handle embodied decision-making in an end-to-end +manner and whether collaborations between LLMs and MLLMs can enhance +decision-making. To address these questions, we introduce a new benchmark +called PCA-EVAL, which evaluates embodied decision-making from the perspectives +of Perception, Cognition, and Action. Additionally, we propose HOLMES, a +multi-agent cooperation framework that allows LLMs to leverage MLLMs and APIs +to gather multimodal information for informed decision-making. We compare +end-to-end embodied decision-making and HOLMES on our benchmark and find that +the GPT4-Vision model demonstrates strong end-to-end embodied decision-making +abilities, outperforming GPT4-HOLMES in terms of average decision accuracy +(+3%). However, this performance is exclusive to the latest GPT4-Vision model, +surpassing the open-source state-of-the-art MLLM by 26%. Our results indicate +that powerful MLLMs like GPT4-Vision hold promise for decision-making in +embodied agents, offering new avenues for MLLM research. Code and data are open +at https://github.com/pkunlp-icler/PCA-EVAL/. + +
+
+ comment: FMDM@NeurIPS2023, Code and data: + https://github.com/pkunlp-icler/PCA-EVAL/ +
+
+
+
+
+ + ♻ ☆ Just ClozE! A Novel Framework for Evaluating the Factual Consistency + Faster in Abstractive Summarization + + +
+ The issue of factual consistency in abstractive summarization has received extensive attention in recent years, and the evaluation of factual consistency between summary and document has become an important and urgent task. Most of the current evaluation metrics are adopted from the question answering (QA) or natural language inference (NLI) tasks. However, the application of QA-based metrics is extremely time-consuming in practice, while NLI-based metrics lack interpretability. In this paper, we propose a cloze-based evaluation framework called ClozE and show the great potential of the cloze-based metric. It inherits strong interpretability from QA, while maintaining NLI-level reasoning speed. We demonstrate that ClozE can reduce the evaluation time by nearly 96% relative to QA-based metrics while retaining their interpretability and performance, through experiments on six human-annotated datasets and a meta-evaluation benchmark, GO FIGURE (Gabriel et al., 2021). Finally, we discuss three important facets of ClozE in practice, which further demonstrate the better overall performance of ClozE compared to other metrics.
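+
+ A rough, hedged sketch of the cloze idea (not the ClozE implementation): factual spans in the summary are masked, a masked language model fills them while also seeing the source document, and the share of recovered spans is read as a consistency signal. Requires the transformers package and downloads roberta-base on first use; the example texts are toy data.
+
+ from transformers import pipeline
+
+ fill = pipeline("fill-mask", model="roberta-base")
+
+ document = "The company was founded in 1998 in Stockholm by two engineers."
+ masked_summary_spans = [("The company was founded in <mask>.", "1998"),
+                         ("The company was founded in the city of <mask>.", "Stockholm")]
+
+ correct = 0
+ for masked_summary, original in masked_summary_spans:
+     prediction = fill(f"{document} {masked_summary}")[0]["token_str"]
+     correct += int(prediction.strip() == original)
+ print(f"cloze consistency: {correct}/{len(masked_summary_spans)}")
+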
+
+ comment: The manuscript for JAIR +
+
+
+
+
+ + ♻ ☆ CodeChain: Towards Modular Code Generation Through Chain of + Self-revisions with Representative Sub-modules + + +
+ Large Language Models (LLMs) have already become quite proficient at solving +simpler programming tasks like those in HumanEval or MBPP benchmarks. However, +solving more complex and competitive programming tasks is still quite +challenging for these models - possibly due to their tendency to generate +solutions as monolithic code blocks instead of decomposing them into logical +sub-tasks and sub-modules. On the other hand, experienced programmers +instinctively write modularized code with abstraction for solving complex +tasks, often reusing previously developed modules. To address this gap, we +propose CodeChain, a novel framework for inference that elicits modularized +code generation through a chain of self-revisions, each being guided by some +representative sub-modules generated in previous iterations. Concretely, +CodeChain first instructs the LLM to generate modularized code through +chain-of-thought prompting. Then it applies a chain of self-revisions by +iterating the two steps: 1) extracting and clustering the generated sub-modules +and selecting the cluster representatives as the more generic and re-usable +implementations, and 2) augmenting the original chain-of-thought prompt with +these selected module-implementations and instructing the LLM to re-generate +new modularized solutions. We find that by naturally encouraging the LLM to +reuse the previously developed and verified sub-modules, CodeChain can +significantly boost both the modularity and the correctness of the generated +solutions, achieving relative pass@1 improvements of 35% on APPS and 76% on +CodeContests. It is shown to be effective on both OpenAI LLMs and open-source +LLMs like WizardCoder. We also conduct comprehensive ablation studies with +different prompting methods, numbers of clusters, model sizes, program +qualities, etc., to provide useful insights that underpin CodeChain's +success. + + 
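+ The two-step revision loop described above can be sketched schematically as
+ follows; this is not the authors' released code, and llm, extract_submodules,
+ and embed are assumed placeholder callables (an LLM completion function, a
+ parser that pulls helper-function definitions out of generated code, and a
+ code-embedding function):
+ 
+ import numpy as np
+ from sklearn.cluster import KMeans
+ 
+ def codechain(problem, llm, extract_submodules, embed,
+               rounds=3, n_clusters=4, n_samples=8):
+     # Step 0: sample modularized solutions via chain-of-thought prompting.
+     prompt = problem + "\nWrite a modular solution using small helper functions."
+     solutions = [llm(prompt) for _ in range(n_samples)]
+     for _ in range(rounds):
+         # Step 1: extract and cluster the generated sub-modules.
+         modules = [m for s in solutions for m in extract_submodules(s)]
+         if len(modules) < n_clusters:
+             break
+         labels = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(
+             np.stack([embed(m) for m in modules]))
+         # One representative per cluster (here simply the longest member).
+         reps = [max((m for m, c in zip(modules, labels) if c == k), key=len)
+                 for k in range(n_clusters)]
+         # Step 2: augment the prompt with the representatives and re-generate.
+         revised = prompt + "\nReuse or adapt these helper functions:\n" + "\n\n".join(reps)
+         solutions = [llm(revised) for _ in range(n_samples)]
+     return solutions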
+
+
+
+
+ + ♻ ☆ On the Road with GPT-4V(ision): Early Explorations of Visual-Language + Model on Autonomous Driving + + +
+ The pursuit of autonomous driving technology hinges on the sophisticated +integration of perception, decision-making, and control systems. Traditional +approaches, both data-driven and rule-based, have been hindered by their +inability to grasp the nuance of complex driving environments and the +intentions of other road users. This has been a significant bottleneck, +particularly in the development of common sense reasoning and nuanced scene +understanding necessary for safe and reliable autonomous driving. The advent of +Visual Language Models (VLM) represents a novel frontier in realizing fully +autonomous vehicle driving. This report provides an exhaustive evaluation of +the latest state-of-the-art VLM, GPT-4V(ision), and its application in +autonomous driving scenarios. We explore the model's abilities to understand +and reason about driving scenes, make decisions, and ultimately act in the +capacity of a driver. Our comprehensive tests span from basic scene recognition +to complex causal reasoning and real-time decision-making under varying +conditions. Our findings reveal that GPT-4V demonstrates superior performance +in scene understanding and causal reasoning compared to existing autonomous +systems. It showcases the potential to handle out-of-distribution scenarios, +recognize intentions, and make informed decisions in real driving contexts. +However, challenges remain, particularly in direction discernment, traffic +light recognition, vision grounding, and spatial reasoning tasks. These +limitations underscore the need for further research and development. Project +is now available on GitHub for interested parties to access and utilize: +\url{https://github.com/PJLab-ADG/GPT4V-AD-Exploration} + +
+
+
+
+
+ + ♻ ☆ A Multitask, Multilingual, Multimodal Evaluation of ChatGPT on + Reasoning, Hallucination, and Interactivity AACL 2023 + + +
+ This paper proposes a framework for quantitatively evaluating interactive +LLMs such as ChatGPT using publicly available data sets. We carry out an +extensive technical evaluation of ChatGPT using 23 data sets covering 8 +different common NLP application tasks. We evaluate the multitask, multilingual +and multi-modal aspects of ChatGPT based on these data sets and a newly +designed multimodal dataset. We find that ChatGPT outperforms LLMs with +zero-shot learning on most tasks and even outperforms fine-tuned models on some +tasks. We find that it is better at understanding non-Latin script languages +than generating them. It is able to generate multimodal content from textual +prompts, via an intermediate code generation step. Moreover, we find that +ChatGPT is 63.41% accurate on average in 10 different reasoning categories +under logical reasoning, non-textual reasoning, and commonsense reasoning, +making it an unreliable reasoner. It is, for example, better at deductive +than inductive reasoning. ChatGPT suffers from hallucination problems like +other LLMs and it generates more extrinsic hallucinations from its parametric +memory as it does not have access to an external knowledge base. Finally, the +interactive feature of ChatGPT enables human collaboration with the underlying +LLM to improve its performance, i.e., 8% ROUGE-1 on summarization and 2% ChrF++ +on machine translation, in a multi-turn "prompt engineering" fashion. We also +release the codebase for evaluation set extraction. + + 
+
+ comment: 45 pages, AACL 2023 +
+
+
+
+
+ + ♻ ☆ FELM: Benchmarking Factuality Evaluation of Large Language Models NeurIPS 2023 + + +
+ Assessing factuality of text generated by large language models (LLMs) is an +emerging yet crucial research area, aimed at alerting users to potential errors +and guiding the development of more reliable LLMs. Nonetheless, the evaluators +assessing factuality necessitate suitable evaluation themselves to gauge +progress and foster advancements. This direction remains under-explored, +resulting in substantial impediments to the progress of factuality evaluators. +To mitigate this issue, we introduce a benchmark for Factuality Evaluation of +large Language Models, referred to as felm. In this benchmark, we collect +responses generated from LLMs and annotate factuality labels in a fine-grained +manner. Contrary to previous studies that primarily concentrate on the +factuality of world knowledge (e.g., information from Wikipedia), felm focuses +on factuality across diverse domains, spanning from world knowledge to math and +reasoning. Our annotation is based on text segments, which can help pinpoint +specific factual errors. The factuality annotations are further supplemented by +predefined error types and reference links that either support or contradict +the statement. In our experiments, we investigate the performance of several +LLM-based factuality evaluators on felm, including both vanilla LLMs and those +augmented with retrieval mechanisms and chain-of-thought processes. Our +findings reveal that while retrieval aids factuality evaluation, current LLMs +are still far from being able to faithfully detect factual errors. + + 
+
+ comment: Accepted by NeurIPS 2023 Track on Datasets and Benchmarks +
+
+
+
+
+ + ♻ ☆ Post-hoc Interpretability for Neural NLP: A Survey + + +
+ Neural networks for NLP are becoming increasingly complex and widespread, and +there is a growing concern about whether these models can be used responsibly. +Explaining models helps to address the safety and ethical concerns and is +essential for accountability. Interpretability serves to provide these +explanations in terms that are understandable to humans. Additionally, post-hoc +methods provide explanations after a model is learned and are generally +model-agnostic. This survey provides a categorization of how recent post-hoc +interpretability methods communicate explanations to humans, discusses each +method in depth, and describes how the methods are validated, as validation is +often a common concern. + + 
+
+
+
+
+ + ♻ ☆ Pre-training Language Models for Comparative Reasoning EMNLP 2023 + + +
+ Comparative reasoning is a process of comparing objects, concepts, or +entities to draw conclusions, which constitutes a fundamental cognitive +ability. In this paper, we propose a novel framework to pre-train language +models for enhancing their abilities of comparative reasoning over texts. While +there have been approaches for NLP tasks that require comparative reasoning, +they suffer from costly manual data labeling and limited generalizability to +different tasks. Our approach introduces a novel method of collecting scalable +data for text-based entity comparison, which leverages both structured and +unstructured data. Moreover, we present a framework of pre-training language +models via three novel objectives on comparative reasoning. Evaluation on +downstream tasks including comparative question answering, question generation, +and summarization shows that our pre-training framework significantly improves +the comparative reasoning abilities of language models, especially under +low-resource conditions. This work also releases the first integrated benchmark +for comparative reasoning. + +
+
+ comment: EMNLP 2023 - Camera Ready. Typos fixed +
+
+
+
+
+ + ♻ ☆ Using large language models to study human memory for meaningful + narratives + + +
+ One of the most impressive achievements of the AI revolution is the +development of large language models that can generate meaningful text and +respond to instructions in plain English with no additional training necessary. +Here we show that language models can be used as a scientific instrument for +studying human memory for meaningful material. We developed a pipeline for +designing large scale memory experiments and analyzing the obtained results. We +performed online memory experiments with a large number of participants and +collected recognition and recall data for narratives of different lengths. We +found that both recall and recognition performance scale linearly with +narrative length. Furthermore, in order to investigate the role of narrative +comprehension in memory, we repeated these experiments using scrambled versions +of the presented stories. We found that even though recall performance declined +significantly, recognition remained largely unaffected. Interestingly, recalls +in this condition seem to follow the original narrative order rather than the +scrambled presentation, pointing to a contextual reconstruction of the story in +memory. + +
+
+ comment: v2: 43 pages, with added discussion and a new appendix C +
+
+
+
+
+ + ♻ ☆ Breaking Language Barriers in Multilingual Mathematical Reasoning: + Insights and Observations + + +
+ Existing research predominantly focuses on developing powerful large language +models (LLMs) for mathematical reasoning within monolingual languages, +with few explorations in preserving efficacy in a multilingual context. To +bridge this gap, this paper pioneers exploring and training powerful +Multilingual Math Reasoning (xMR) LLMs. Firstly, by utilizing translation, we +construct the first multilingual math reasoning instruction dataset, +MGSM8KInstruct, encompassing ten distinct languages, thus addressing the issue +of training data scarcity in xMR tasks. Based on the collected dataset, we +propose different training strategies to build powerful xMR LLMs, named +MathOctopus, which notably outperform conventional open-source LLMs and exhibit +superiority over ChatGPT in few-shot scenarios. Notably, MathOctopus-13B +reaches 47.6% accuracy, exceeding ChatGPT's 46.3% on the MGSM test set. Beyond +remarkable results, we unearth several pivotal observations and insights from +extensive experiments: (1) When extending the rejection sampling strategy to +the multilingual context, it proves effective for model performance, albeit +with limited gains. (2) Employing parallel corpora for math Supervised +Fine-Tuning (SFT) across multiple languages not only significantly enhances +model performance multilingually but also elevates monolingual performance. +This indicates that crafting multilingual corpora can be regarded as a vital +strategy for enhancing model performance in a specific language, especially in +mathematical reasoning tasks. For instance, MathOctopus-7B improves on its +counterpart trained only on English, from 42.2% to 50.8% on the GSM8K test set. + + 
+
+ comment: Work in Progress +
+
+
+
+
+ + ♻ ☆ Phenomenal Yet Puzzling: Testing Inductive Reasoning Capabilities of + Language Models with Hypothesis Refinement + + +
+ The ability to derive underlying principles from a handful of observations +and then generalize to novel situations -- known as inductive reasoning -- is +central to human intelligence. Prior work suggests that language models (LMs) +often fall short on inductive reasoning, despite achieving impressive success +on research benchmarks. In this work, we conduct a systematic study of the +inductive reasoning capabilities of LMs through iterative hypothesis +refinement, a technique that more closely mirrors the human inductive process +than standard input-output prompting. Iterative hypothesis refinement employs a +three-step process: proposing, selecting, and refining hypotheses in the form +of textual rules. By examining the intermediate rules, we observe that LMs are +phenomenal hypothesis proposers (i.e., generating candidate rules), and when +coupled with a (task-specific) symbolic interpreter that is able to +systematically filter the proposed set of rules, this hybrid approach achieves +strong results across inductive reasoning benchmarks that require inducing +causal relations, language-like instructions, and symbolic concepts. However, +they also behave as puzzling inductive reasoners, showing notable performance +gaps between rule induction (i.e., identifying plausible rules) and rule +application (i.e., applying proposed rules to instances), suggesting that LMs +are proposing hypotheses without being able to actually apply the rules. +Through empirical and human analyses, we further reveal several discrepancies +between the inductive reasoning processes of LMs and humans, shedding light on +both the potentials and limitations of using LMs in inductive reasoning tasks. + +
+
+
+
+
+ + ♻ ☆ On the Performance of Multimodal Language Models + + +
+ Instruction-tuned large language models (LLMs) have demonstrated promising +zero-shot generalization capabilities across various downstream tasks. Recent +research has introduced multimodal capabilities to LLMs by integrating +independently pretrained vision encoders through model grafting. These +multimodal variants undergo instruction tuning, similar to LLMs, enabling +effective zero-shot generalization for multimodal tasks. This study conducts a +comparative analysis of different multimodal instruction tuning approaches and +evaluates their performance across a range of tasks, including complex +reasoning, conversation, image captioning, multiple-choice questions (MCQs), +and binary classification. Through rigorous benchmarking and ablation +experiments, we reveal key insights for guiding architectural choices when +incorporating multimodal capabilities into LLMs. However, current approaches +have limitations; they do not sufficiently address the need for a diverse +multimodal instruction dataset, which is crucial for enhancing task +generalization. Additionally, they overlook issues related to truthfulness and +factuality when generating responses. These findings illuminate current +methodological constraints in adapting language models for image comprehension +and provide valuable guidance for researchers and practitioners seeking to +harness multimodal versions of LLMs. + +
+
+
+
+
+ + ♻ ☆ The effect of source disclosure on evaluation of AI-generated messages: + A two-part study + + +
+ Advancements in artificial intelligence (AI) over the last decade demonstrate +that machines can exhibit communicative behavior and influence how humans +think, feel, and behave. In fact, the recent development of ChatGPT has shown +that large language models (LLMs) can be leveraged to generate high-quality +communication content at scale and across domains, suggesting that they will be +increasingly used in practice. However, many questions remain about how knowing +the source of the messages influences recipients' evaluation of and preference +for AI-generated messages compared to human-generated messages. This paper +investigated this topic in the context of vaping prevention messaging. In Study +1, which was pre-registered, we examined the influence of source disclosure on +people's evaluation of AI-generated health prevention messages compared to +human-generated messages. We found that source disclosure (i.e., labeling the +source of a message as AI vs. human) significantly impacted the evaluation of +the messages but did not significantly alter message rankings. In a follow-up +study (Study 2), we examined how the influence of source disclosure may vary by +the participants' negative attitudes towards AI. We found a significant +moderating effect of negative attitudes towards AI on message evaluation, but +not for message selection. However, for those with moderate levels of negative +attitudes towards AI, source disclosure decreased the preference for +AI-generated messages. Overall, the results of this series of studies showed a +slight bias against AI-generated messages once the source was disclosed, adding +to the emerging area of study that lies at the intersection of AI and +communication. + +
+
+ comment: Manuscript currently under review. Paper presented at 109th Annual + National Communication Association (NCA) Conference, November 16-19, 2023. 10 + pages, 5 figures. Supplementary file formatting updated in current version +
+
+
+
+
+ + ♻ ☆ Certifying LLM Safety against Adversarial Prompting + + +
+ Large language models (LLMs) released for public use incorporate guardrails +to ensure their output is safe, often referred to as "model alignment." An +aligned language model should decline a user's request to produce harmful +content. However, such safety measures are vulnerable to adversarial attacks, +which add maliciously designed token sequences to a harmful prompt to bypass +the model's safety guards. In this work, we introduce erase-and-check, the +first framework to defend against adversarial prompts with verifiable safety +guarantees. We defend against three attack modes: i) adversarial suffix, which +appends an adversarial sequence at the end of the prompt; ii) adversarial +insertion, where the adversarial sequence is inserted anywhere in the middle of +the prompt; and iii) adversarial infusion, where adversarial tokens are +inserted at arbitrary positions in the prompt, not necessarily as a contiguous +block. Our experimental results demonstrate that this procedure can obtain +strong certified safety guarantees on harmful prompts while maintaining good +empirical performance on safe prompts. For example, against adversarial +suffixes of length 20, it certifiably detects 92% of harmful prompts and labels +94% of safe prompts correctly using the open-source language model Llama 2 as +the safety filter. We further improve the filter's performance, in terms of +accuracy and speed, by replacing Llama 2 with a DistilBERT safety classifier +fine-tuned on safe and harmful prompts. Additionally, we propose two efficient +empirical defenses: i) RandEC, a randomized version of erase-and-check that +evaluates the safety filter on a small subset of the erased subsequences, and +ii) GradEC, a gradient-based version that optimizes the erased tokens to remove +the adversarial sequence. The code for our experiments is available at +https://github.com/aounon/certified-llm-safety. + +
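+ A minimal sketch of the erase-and-check idea for the adversarial-suffix mode
+ (the insertion and infusion modes enumerate more erasure patterns): a prompt
+ is rejected if the safety filter flags the prompt itself or any version with
+ up to max_erase trailing tokens removed. Here is_harmful is a placeholder for
+ the Llama 2 or DistilBERT safety filter described in the abstract.
+ 
+ def erase_and_check_suffix(prompt_tokens, is_harmful, max_erase=20):
+     # Check the full prompt and every prefix obtained by erasing up to
+     # max_erase tokens from the end; flag the prompt if any check fires.
+     limit = min(max_erase, max(len(prompt_tokens) - 1, 0))
+     for k in range(limit + 1):
+         if is_harmful(prompt_tokens[:len(prompt_tokens) - k]):
+             return True   # some erased version is flagged -> treat as harmful
+     return False          # every checked version passes the safety filter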
+
+
+
+
+ + ♻ ☆ Does Conceptual Representation Require Embodiment? Insights From Large + Language Models + + +
+ To what extent can language alone give rise to complex concepts, or is +embodied experience essential? Recent advancements in large language models +(LLMs) offer fresh perspectives on this question. Although LLMs are trained on +restricted modalities, they exhibit human-like performance in diverse +psychological tasks. Our study compared representations of 4,442 lexical +concepts between humans and ChatGPTs (GPT-3.5 and GPT-4) across multiple +dimensions, including five key domains: emotion, salience, mental +visualization, sensory, and motor experience. We identify two main findings: 1) +Both models strongly align with human representations in non-sensorimotor +domains but lag in sensory and motor areas, with GPT-4 outperforming GPT-3.5; +2) GPT-4's gains are associated with its additional visual learning, which also +appears to benefit related dimensions like haptics and imageability. These +results highlight the limitations of language in isolation, and that the +integration of diverse modalities of inputs leads to a more human-like +conceptual representation. + +
+
+
+
+
+ + ♻ ☆ A Baseline Analysis of Reward Models' Ability To Accurately Analyze + Foundation Models Under Distribution Shift + + +
+ Foundation models, specifically Large Language Models (LLMs), have lately +gained widespread attention and adoption. Reinforcement Learning with Human +Feedback (RLHF) involves training a reward model to capture desired behaviors, +which is then used to align an LLM. These reward models are additionally used +at inference-time to estimate how well LLM responses adhere to those desired +behaviors. However, there is little work measuring how robust these reward +models are to distribution shifts. In this work, we evaluate how reward model +performance - measured via accuracy and calibration (i.e., alignment between +accuracy and confidence) - is affected by distribution shift. We show novel +calibration patterns and accuracy drops due to OOD prompts and responses, and +that the reward model is more sensitive to shifts in responses than prompts. +Additionally, we adapt an OOD detection technique commonly used in +classification to the reward model setting in order to detect these +distribution shifts in prompts and responses. + + 
+
+
+
+
+ + ♻ ☆ HallusionBench: An Advanced Diagnostic Suite for Entangled Language + Hallucination & Visual Illusion in Large Vision-Language Models + + +
+ We introduce HallusionBench, a comprehensive benchmark designed for the +evaluation of image-context reasoning. This benchmark presents significant +challenges to advanced large visual-language models (LVLMs), such as +GPT-4V(Vision) and LLaVA-1.5, by emphasizing nuanced understanding and +interpretation of visual data. The benchmark comprises 346 images paired with +1129 questions, all meticulously crafted by human experts. We introduce a novel +structure for these visual questions designed to establish control groups. This +structure enables us to conduct a quantitative analysis of the models' response +tendencies, logical consistency, and various failure modes. In our evaluation +on HallusionBench, we benchmarked 13 different models, highlighting a 31.42% +question-pair accuracy achieved by the state-of-the-art GPT-4V. Notably, all +other evaluated models achieve accuracy below 16%. Moreover, our analysis not +only highlights the observed failure modes, including language hallucination +and visual illusion, but also deepens an understanding of these pitfalls. Our +comprehensive case studies within HallusionBench shed light on the challenges +of hallucination and illusion in LVLMs. Based on these insights, we suggest +potential pathways for their future improvement. The benchmark and codebase can +be accessed at https://github.com/tianyi-lab/HallusionBench. + +
+
+
+
+
+ + ♻ ☆ Towards Understanding In-Context Learning with Contrastive + Demonstrations and Saliency Maps + + +
+ We investigate the role of various demonstration components in the in-context +learning (ICL) performance of large language models (LLMs). Specifically, we +explore the impacts of ground-truth labels, input distribution, and +complementary explanations, particularly when these are altered or perturbed. +We build on previous work, which offers mixed findings on how these elements +influence ICL. To probe these questions, we employ explainable NLP (XNLP) +methods and utilize saliency maps of contrastive demonstrations for both +qualitative and quantitative analysis. Our findings reveal that flipping +ground-truth labels significantly affects the saliency, though it's more +noticeable in larger LLMs. Our analysis of the input distribution at a granular +level reveals that changing sentiment-indicative terms in a sentiment analysis +task to neutral ones does not have as substantial an impact as altering +ground-truth labels. Finally, we find that the effectiveness of complementary +explanations in boosting ICL performance is task-dependent, with limited +benefits seen in sentiment analysis tasks compared to symbolic reasoning tasks. +These insights are critical for understanding the functionality of LLMs and +guiding the development of effective demonstrations, which is increasingly +relevant in light of the growing use of LLMs in applications such as ChatGPT. +Our research code is publicly available at https://github.com/paihengxu/XICL. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ On Separate Normalization in Self-supervised Transformers NIPS 2023 + + +
+ Self-supervised training methods for transformers have demonstrated +remarkable performance across various domains. Previous transformer-based +models, such as masked autoencoders (MAE), typically utilize a single +normalization layer for both the [CLS] symbol and the tokens. We propose in +this paper a simple modification that employs separate normalization layers for +the tokens and the [CLS] symbol to better capture their distinct +characteristics and enhance downstream task performance. Our method aims to +alleviate the potential negative effects of using the same normalization +statistics for both token types, which may not be optimally aligned with their +individual roles. We empirically show that by utilizing a separate +normalization layer, the [CLS] embeddings can better encode the global +contextual information and are distributed more uniformly in its anisotropic +space. When replacing the conventional normalization layer with the two +separate layers, we observe an average 2.7% performance improvement over the +image, natural language, and graph domains. + +
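+ A hedged sketch of the described change (assuming the [CLS] symbol sits at
+ position 0 of the sequence, which may differ from the authors' implementation):
+ one LayerNorm for the [CLS] embedding and an independent LayerNorm for all
+ other tokens, in place of a single shared normalization layer.
+ 
+ import torch
+ import torch.nn as nn
+ 
+ class SeparateNorm(nn.Module):
+     def __init__(self, dim):
+         super().__init__()
+         self.cls_norm = nn.LayerNorm(dim)    # parameters/statistics for [CLS]
+         self.token_norm = nn.LayerNorm(dim)  # parameters/statistics for tokens
+ 
+     def forward(self, x):
+         # x: (batch, seq_len, dim), with the [CLS] symbol at position 0
+         cls = self.cls_norm(x[:, :1, :])
+         tokens = self.token_norm(x[:, 1:, :])
+         return torch.cat([cls, tokens], dim=1)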
+
+ comment: NIPS 2023 +
+
+
+
+
+ + ♻ ☆ Explainability for Large Language Models: A Survey + + +
+ Large language models (LLMs) have demonstrated impressive capabilities in +natural language processing. However, their internal mechanisms are still +unclear and this lack of transparency poses unwanted risks for downstream +applications. Therefore, understanding and explaining these models is crucial +for elucidating their behaviors, limitations, and social impacts. In this +paper, we introduce a taxonomy of explainability techniques and provide a +structured overview of methods for explaining Transformer-based language +models. We categorize techniques based on the training paradigms of LLMs: +traditional fine-tuning-based paradigm and prompting-based paradigm. For each +paradigm, we summarize the goals and dominant approaches for generating local +explanations of individual predictions and global explanations of overall model +knowledge. We also discuss metrics for evaluating generated explanations, and +discuss how explanations can be leveraged to debug models and improve +performance. Lastly, we examine key challenges and emerging opportunities for +explanation techniques in the era of LLMs in comparison to conventional machine +learning models. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 150 + +
+
+
+ + ☆ Material Palette: Extraction of Materials from a Single Image + + +
+ In this paper, we propose a method to extract physically-based rendering +(PBR) materials from a single real-world image. We do so in two steps: first, +we map regions of the image to material concepts using a diffusion model, which +allows the sampling of texture images resembling each material in the scene. +Second, we benefit from a separate network to decompose the generated textures +into Spatially Varying BRDFs (SVBRDFs), providing us with materials ready to be +used in rendering applications. Our approach builds on existing synthetic +material libraries with SVBRDF ground truth, but also exploits a +diffusion-generated RGB texture dataset to allow generalization to new samples +using unsupervised domain adaptation (UDA). Our contributions are thoroughly +evaluated on synthetic and real-world datasets. We further demonstrate the +applicability of our method for editing 3D scenes with materials estimated from +real photographs. The code and models will be made open-source. Project page: +https://astra-vision.github.io/MaterialPalette/ + +
+
+ comment: 8 pages, 11 figures, 2 tables. Webpage + https://astra-vision.github.io/MaterialPalette/ +
+
+
+
+
+ + ☆ HumanGaussian: Text-Driven 3D Human Generation with Gaussian Splatting + + +
+ Realistic 3D human generation from text prompts is a desirable yet +challenging task. Existing methods optimize 3D representations like mesh or +neural fields via score distillation sampling (SDS), which suffers from +inadequate fine details or excessive training time. In this paper, we propose +an efficient yet effective framework, HumanGaussian, that generates +high-quality 3D humans with fine-grained geometry and realistic appearance. Our +key insight is that 3D Gaussian Splatting is an efficient renderer with +periodic Gaussian shrinkage or growing, where such adaptive density control can +be naturally guided by intrinsic human structures. Specifically, 1) we first +propose a Structure-Aware SDS that simultaneously optimizes human appearance +and geometry. The multi-modal score function from both RGB and depth space is +leveraged to distill the Gaussian densification and pruning process. 2) +Moreover, we devise an Annealed Negative Prompt Guidance by decomposing SDS +into a noisier generative score and a cleaner classifier score, which well +addresses the over-saturation issue. The floating artifacts are further +eliminated based on Gaussian size in a prune-only phase to enhance generation +smoothness. Extensive experiments demonstrate the superior efficiency and +competitive quality of our framework, rendering vivid 3D humans under diverse +scenarios. Project Page: https://alvinliu0.github.io/projects/HumanGaussian + +
+
+ comment: Project Page: https://alvinliu0.github.io/projects/HumanGaussian +
+
+
+
+
+ + ☆ Panoptic Video Scene Graph Generation CVPR 2023 + + +
+ Towards building comprehensive real-world visual perception systems, we +propose and study a new problem called panoptic video scene graph generation +(PVSG). PVSG relates to the existing video scene graph generation (VidSGG) +problem, which focuses on temporal interactions between humans and objects +grounded with bounding boxes in videos. However, the limitation of bounding +boxes in detecting non-rigid objects and backgrounds often causes VidSGG to +miss key details crucial for comprehensive video understanding. In contrast, +PVSG requires nodes in scene graphs to be grounded by more precise, pixel-level +segmentation masks, which facilitate holistic scene understanding. To advance +research in this new area, we contribute the PVSG dataset, which consists of +400 videos (289 third-person + 111 egocentric videos) with a total of 150K +frames labeled with panoptic segmentation masks as well as fine, temporal scene +graphs. We also provide a variety of baseline methods and share useful design +practices for future work. + + 
+
+ comment: Accepted to CVPR 2023. Project Page: + https://jingkang50.github.io/PVSG/. Codebase: + https://github.com/LilyDaytoy/OpenPVSG. We provide 400 long videos with + frame-level panoptic segmentation, scene graph, dense captions, and QA + annotations +
+
+
+
+
+ + ☆ ReMoS: Reactive 3D Motion Synthesis for Two-Person Interactions + + +
+ Current approaches for 3D human motion synthesis can generate high-quality 3D +animations of digital humans performing a wide variety of actions and gestures. +However, there is still a notable technological gap in addressing the complex +dynamics of multi-human interactions within this paradigm. In this work, we +introduce ReMoS, a denoising diffusion-based probabilistic model for reactive +motion synthesis that explores two-person interactions. Given the motion of one +person, we synthesize the reactive motion of the second person to complete the +interactions between the two. In addition to synthesizing the full-body +motions, we also synthesize plausible hand interactions. We show the +performance of ReMoS under a wide range of challenging two-person scenarios +including pair-dancing, Ninjutsu, kickboxing, and acrobatics, where one +person's movements have complex and diverse influences on the motions of the +other. We further propose the ReMoCap dataset for two-person interactions +consisting of full-body and hand motions. We evaluate our approach through +multiple quantitative metrics, qualitative visualizations, and a user study. +Our results are usable in interactive applications while also providing an +adequate amount of control for animators. + +
+
+ comment: 13 pages, 8 figures, 3 tables +
+
+
+
+
+ + ☆ Self-Supervised Motion Magnification by Backpropagating Through Optical + Flow + + +
+ This paper presents a simple, self-supervised method for magnifying subtle +motions in video: given an input video and a magnification factor, we +manipulate the video such that its new optical flow is scaled by the desired +amount. To train our model, we propose a loss function that estimates the +optical flow of the generated video and penalizes how far it deviates from the +given magnification factor. Thus, training involves differentiating through a +pretrained optical flow network. Since our model is self-supervised, we can +further improve its performance through test-time adaptation, by finetuning it +on the input video. It can also be easily extended to magnify the motions of +only user-selected objects. Our approach avoids the need for synthetic +magnification datasets that have been used to train prior learning-based +approaches. Instead, it leverages the existing capabilities of off-the-shelf +motion estimators. We demonstrate the effectiveness of our method through +evaluations of both visual quality and quantitative metrics on a range of +real-world and synthetic videos, and we show our method works for both +supervised and unsupervised optical flow methods. + + 
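+ A hedged sketch of the training loss described above; flow_net stands in for
+ a frozen, pretrained optical-flow estimator (e.g. RAFT) that the loss is
+ differentiated through, and the generator producing gen_frame is omitted.
+ 
+ import torch.nn.functional as F
+ 
+ def magnification_loss(flow_net, frame_a, frame_b, gen_frame, alpha):
+     # Target: the generated frame's flow should be alpha times the input flow.
+     base_flow = flow_net(frame_a, frame_b).detach()   # flow of the input pair
+     gen_flow = flow_net(frame_a, gen_frame)           # flow of the generated pair
+     return F.l1_loss(gen_flow, alpha * base_flow)     # penalize the deviation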
+
+
+
+
+ + ☆ Rethinking Directional Integration in Neural Radiance Fields + + +
+ Recent works use the Neural radiance field (NeRF) to perform multi-view 3D +reconstruction, providing a significant leap in rendering photorealistic +scenes. However, despite its efficacy, NeRF exhibits limited capability of +learning view-dependent effects compared to light field rendering or +image-based view synthesis. To that end, we introduce a modification to the +NeRF rendering equation which is as simple as a few lines of code change for +any NeRF variant, while greatly improving the rendering quality of +view-dependent effects. By swapping the integration operator and the direction +decoder network, we only integrate the positional features along the ray and +move the directional terms out of the integration, resulting in a +disentanglement of the view-dependent and independent components. The modified +equation is equivalent to the classical volumetric rendering in ideal cases on +object surfaces with Dirac densities. Furthermore, we prove that with the +errors caused by network approximation and numerical integration, our rendering +equation exhibits better convergence properties with lower error accumulation +compared to the classical NeRF. We also show that the modified equation can be +interpreted as light field rendering with learned ray embeddings. Experiments +on different NeRF variants show consistent improvements in the quality of +view-dependent effects with our simple modification. + + 
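+ As a hedged reading of the modification described above (notation is ours,
+ not the paper's), classical NeRF integrates a direction-conditioned color
+ along the ray, whereas the modified equation integrates positional features
+ only and applies the directional decoder $D$ once, outside the integral:
+ \[
+ C(\mathbf{r}) = \int_{t_n}^{t_f} T(t)\,\sigma(\mathbf{x}(t))\,
+     c\big(\mathbf{x}(t), \mathbf{d}\big)\, dt
+ \quad \longrightarrow \quad
+ \hat{C}(\mathbf{r}) = D\!\left( \int_{t_n}^{t_f} T(t)\,\sigma(\mathbf{x}(t))\,
+     \mathbf{f}\big(\mathbf{x}(t)\big)\, dt,\ \mathbf{d} \right),
+ \]
+ where $T(t) = \exp\big(-\int_{t_n}^{t} \sigma(\mathbf{x}(s))\, ds\big)$ is the
+ transmittance, $\mathbf{f}$ denotes the per-point positional features, and
+ $\mathbf{d}$ is the viewing direction. On a surface with a Dirac density the
+ integral collapses to a single point, so both forms agree, consistent with the
+ equivalence stated in the abstract.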
+
+
+
+
+ + ☆ No Representation Rules Them All in Category Discovery NeurIPS 2023 + + +
+ In this paper we tackle the problem of Generalized Category Discovery (GCD). +Specifically, given a dataset with labelled and unlabelled images, the task is +to cluster all images in the unlabelled subset, whether or not they belong to +the labelled categories. Our first contribution is to recognize that most +existing GCD benchmarks only contain labels for a single clustering of the +data, making it difficult to ascertain whether models are using the available +labels to solve the GCD task, or simply solving an unsupervised clustering +problem. As such, we present a synthetic dataset, named 'Clevr-4', for category +discovery. Clevr-4 contains four equally valid partitions of the data, i.e., +based on object shape, texture, color, or count. To solve the task, models are +required to extrapolate the taxonomy specified by the labelled set, rather than +simply latching onto a single natural grouping of the data. We use this dataset +to demonstrate the limitations of unsupervised clustering in the GCD setting, +showing that even very strong unsupervised models fail on Clevr-4. We further +use Clevr-4 to examine the weaknesses of existing GCD algorithms, and propose a +new method which addresses these shortcomings, leveraging consistent findings +from the representation learning literature to do so. Our simple solution, +which is based on 'mean teachers' and termed $\mu$GCD, substantially +outperforms implemented baselines on Clevr-4. Finally, when we transfer these +findings to real data on the challenging Semantic Shift Benchmark (SSB), we +find that $\mu$GCD outperforms all prior work, setting a new state-of-the-art. +For the project webpage, see https://www.robots.ox.ac.uk/~vgg/data/clevr4/ + + 
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ☆ DiffuseBot: Breeding Soft Robots With Physics-Augmented Generative + Diffusion Models NeurIPS 2023 + + +
+ Nature evolves creatures with a high complexity of morphological and +behavioral intelligence, meanwhile computational methods lag in approaching +that diversity and efficacy. Co-optimization of artificial creatures' +morphology and control in silico shows promise for applications in physical +soft robotics and virtual character creation; such approaches, however, require +developing new learning algorithms that can reason about function atop pure +structure. In this paper, we present DiffuseBot, a physics-augmented diffusion +model that generates soft robot morphologies capable of excelling in a wide +spectrum of tasks. DiffuseBot bridges the gap between virtually generated +content and physical utility by (i) augmenting the diffusion process with a +physical dynamical simulation which provides a certificate of performance, and +(ii) introducing a co-design procedure that jointly optimizes physical design +and control by leveraging information about physical sensitivities from +differentiable simulation. We showcase a range of simulated and fabricated +robots along with their capabilities. Check our website at +https://diffusebot.github.io/ + +
+
+ comment: NeurIPS 2023. Project page: https://diffusebot.github.io/ +
+
+
+
+
+ + ☆ Surf-D: High-Quality Surface Generation for Arbitrary Topologies using + Diffusion Models + + +
+ In this paper, we present Surf-D, a novel method for generating high-quality +3D shapes as Surfaces with arbitrary topologies using Diffusion models. +Specifically, we adopt Unsigned Distance Field (UDF) as the surface +representation, as it excels in handling arbitrary topologies, enabling the +generation of complex shapes. While the prior methods explored shape generation +with different representations, they suffer from limited topologies and +geometry details. Moreover, it's non-trivial to directly extend prior diffusion +models to UDF because they lack spatial continuity due to the discrete volume +structure. However, UDF requires accurate gradients for mesh extraction and +learning. To tackle the issues, we first leverage a point-based auto-encoder to +learn a compact latent space, which supports gradient querying for any input +point through differentiation to effectively capture intricate geometry at a +high resolution. Since the learning difficulty for various shapes can differ, a +curriculum learning strategy is employed to efficiently embed various surfaces, +enhancing the whole embedding process. With pretrained shape latent space, we +employ a latent diffusion model to acquire the distribution of various shapes. +Our approach demonstrates superior performance in shape generation across +multiple modalities and conducts extensive experiments in unconditional +generation, category conditional generation, 3D reconstruction from images, and +text-to-shape tasks. + +
+
+ comment: Project Page: https://yzmblog.github.io/projects/SurfD/ +
+
+
+
+
+ + ☆ MobileCLIP: Fast Image-Text Models through Multi-Modal Reinforced + Training + + +
+ Contrastive pretraining of image-text foundation models, such as CLIP, +demonstrated excellent zero-shot performance and improved robustness on a wide +range of downstream tasks. However, these models utilize large +transformer-based encoders with significant memory and latency overhead which +pose challenges for deployment on mobile devices. In this work, we introduce +MobileCLIP -- a new family of efficient image-text models optimized for runtime +performance along with a novel and efficient training approach, namely +multi-modal reinforced training. The proposed training approach leverages +knowledge transfer from an image captioning model and an ensemble of strong +CLIP encoders to improve the accuracy of efficient models. Our approach avoids +train-time compute overhead by storing the additional knowledge in a reinforced +dataset. MobileCLIP sets a new state-of-the-art latency-accuracy tradeoff for +zero-shot classification and retrieval tasks on several datasets. Our +MobileCLIP-S2 variant is 2.3$\times$ faster and more accurate than the previous +best CLIP model based on ViT-B/16. We further demonstrate the effectiveness of +our multi-modal reinforced training by training a CLIP model based on a +ViT-B/16 image backbone and achieving +2.9% average performance improvement on +38 evaluation benchmarks compared to the previous best. +Moreover, we show that the proposed approach achieves 10$\times$-1000$\times$ +improved learning efficiency when compared with non-reinforced CLIP training. + + 
+
+
+
+
+ + ☆ Zero-shot Referring Expression Comprehension via Structural Similarity + Between Images and Captions + + +
+ Zero-shot referring expression comprehension aims at localizing bounding +boxes in an image corresponding to the provided textual prompts, which +requires: (i) a fine-grained disentanglement of complex visual scene and +textual context, and (ii) a capacity to understand relationships among +disentangled entities. Unfortunately, existing large vision-language alignment +(VLA) models, e.g., CLIP, struggle with both aspects and so cannot be directly +used for this task. To mitigate this gap, we leverage large foundation models +to disentangle both images and texts into triplets in the format of (subject, +predicate, object). After that, grounding is accomplished by calculating the +structural similarity matrix between visual and textual triplets with a VLA +model, and subsequently propagating it to an instance-level similarity matrix. +Furthermore, to equip VLA models with the ability of relationship +understanding, we design a triplet-matching objective to fine-tune the VLA +models on a curated collection of datasets containing abundant entity +relationships. Experiments demonstrate that our visual grounding performance +increases by up to 19.5% over the SOTA zero-shot model on RefCOCO/+/g. On the +more challenging Who's Waldo dataset, our zero-shot approach achieves +comparable accuracy to the fully supervised model. + + 
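+ A rough sketch (not the authors' code) of the triplet-level structural
+ similarity: embed each (subject, predicate, object) component of the visual
+ and textual triplets with a VLA encoder, take component-wise cosine
+ similarities, and average them into one score per triplet pair; here encode
+ is an assumed placeholder for, e.g., CLIP's text encoder, and the propagation
+ to instance-level scores is omitted.
+ 
+ import numpy as np
+ 
+ def structural_similarity(visual_triplets, text_triplets, encode):
+     def embed(triplet):                       # (subject, predicate, object)
+         v = np.stack([encode(part) for part in triplet])
+         return v / np.linalg.norm(v, axis=1, keepdims=True)
+     V = [embed(t) for t in visual_triplets]
+     T = [embed(t) for t in text_triplets]
+     S = np.zeros((len(V), len(T)))
+     for i, v in enumerate(V):
+         for j, t in enumerate(T):
+             S[i, j] = float((v * t).sum(axis=1).mean())  # mean cosine over s/p/o
+     return S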
+
+
+
+
+ + ☆ LLaMA-VID: An Image is Worth 2 Tokens in Large Language Models + + +
+ In this work, we present a novel method to tackle the token generation +challenge in Vision Language Models (VLMs) for video and image understanding, +called LLaMA-VID. Current VLMs, while proficient in tasks like image captioning +and visual question answering, face computational burdens when processing long +videos due to the excessive visual tokens. LLaMA-VID addresses this issue by +representing each frame with two distinct tokens, namely a context token and a +content token. The context token encodes the overall image context based on +user input, whereas the content token encapsulates visual cues in each frame. +This dual-token strategy significantly reduces the overload of long videos +while preserving critical information. Generally, LLaMA-VID empowers existing +frameworks to support hour-long videos and pushes their upper limit with an +extra context token. It is shown to surpass previous methods on most video- and +image-based benchmarks. Code is available at +https://github.com/dvlab-research/LLaMA-VID + + 
+
+ comment: Code is available at https://github.com/dvlab-research/LLaMA-VID +
+
+
+
+
+ + ☆ Adversarial Diffusion Distillation + + +
+ We introduce Adversarial Diffusion Distillation (ADD), a novel training +approach that efficiently samples large-scale foundational image diffusion +models in just 1-4 steps while maintaining high image quality. We use score +distillation to leverage large-scale off-the-shelf image diffusion models as a +teacher signal in combination with an adversarial loss to ensure high image +fidelity even in the low-step regime of one or two sampling steps. Our analyses +show that our model clearly outperforms existing few-step methods (GANs, Latent +Consistency Models) in a single step and reaches the performance of +state-of-the-art diffusion models (SDXL) in only four steps. ADD is the first +method to unlock single-step, real-time image synthesis with foundation models. +Code and weights available under +https://github.com/Stability-AI/generative-models and +https://huggingface.co/stabilityai/ . + +
+
+
+
+
+ + ☆ Efficient In-Context Learning in Vision-Language Models for Egocentric + Videos + + +
+ Recent advancements in text-only large language models (LLMs) have +highlighted the benefit of in-context learning for adapting to new tasks with a +few demonstrations. However, extending in-context learning to large +vision-language models (VLMs) using a huge amount of naturalistic +vision-language data has shown limited success, particularly for egocentric +videos, due to high data collection costs. We propose a novel training method +$\mathbb{E}$fficient $\mathbb{I}$n-context $\mathbb{L}$earning on +$\mathbb{E}$gocentric $\mathbb{V}$ideos ($\mathbb{EILEV}$), which elicits +in-context learning in VLMs for egocentric videos without requiring massive, +naturalistic egocentric video datasets. $\mathbb{EILEV}$ involves architectural +and training data adaptations to allow the model to process contexts +interleaved with video clips and narrations, sampling of in-context examples +with clusters of similar verbs and nouns, use of data with skewed marginal +distributions with a long tail of infrequent verbs and nouns, as well as +homonyms and synonyms. Our evaluations show that $\mathbb{EILEV}$-trained +models outperform larger VLMs trained on a huge amount of naturalistic data in +in-context learning. Furthermore, they can generalize to not only +out-of-distribution, but also novel, rare egocentric videos and texts via +in-context learning, demonstrating potential for applications requiring +cost-effective training, and rapid post-deployment adaptability. Our code and +demo are available at \url{https://github.com/yukw777/EILEV}. + +
+
+
+
+
+ + ☆ Telling Left from Right: Identifying Geometry-Aware Semantic + Correspondence + + +
+ While pre-trained large-scale vision models have shown significant promise +for semantic correspondence, their features often struggle to grasp the +geometry and orientation of instances. This paper identifies the importance of +being geometry-aware for semantic correspondence and reveals a limitation of +the features of current foundation models under simple post-processing. We show +that incorporating this information can markedly enhance semantic +correspondence performance with simple but effective solutions in both +zero-shot and supervised settings. We also construct a new challenging +benchmark for semantic correspondence built from an existing animal pose +estimation dataset, for both pre-training and validating models. Our method +achieves a PCK@0.10 score of 64.2 (zero-shot) and 85.6 (supervised) on the +challenging SPair-71k dataset, outperforming the state-of-the-art by 4.3p and +11.0p absolute gains, respectively. Our code and datasets will be publicly +available. + + 
+
+ comment: Project page: https://telling-left-from-right.github.io/ +
+
+
+
+
+ + ☆ When the Few Outweigh the Many: Illicit Content Recognition with + Few-Shot Learning + + +
+ The anonymity and untraceability benefits of the Dark Web account for its +exponentially increasing popularity, while also creating a suitable breeding +ground for many illicit activities. Hence, in collaboration with +cybersecurity and law enforcement agencies, research has provided approaches +for recognizing and classifying illicit activities, with most relying on +recognizing the textual content of dark web markets; few such approaches use +images originating from dark web content. This paper investigates this +alternative technique for recognizing illegal activities from images. In +particular, we investigate label-agnostic learning techniques like One-Shot and +Few-Shot learning featuring the use of Siamese neural networks, a +state-of-the-art approach in the field. Our solution manages to handle +small-scale datasets with promising accuracy. In particular, Siamese neural +networks reach 90.9% on 20-Shot experiments over a 10-class dataset; this leads +us to conclude that such models are a promising and cheaper alternative for +building automated law-enforcement machinery for the dark web. + + 
+
+
+
+
+ + ☆ Diffusion 3D Features (Diff3F): Decorating Untextured Shapes with + Distilled Semantic Features + + +
+ We present Diff3F as a simple, robust, and class-agnostic feature descriptor +that can be computed for untextured input shapes (meshes or point clouds). Our +method distills diffusion features from image foundation models onto input +shapes. Specifically, we use the input shapes to produce depth and normal maps +as guidance for conditional image synthesis, and in the process produce +(diffusion) features in 2D that we subsequently lift and aggregate on the +original surface. Our key observation is that even if the conditional image +generations obtained from multi-view rendering of the input shapes are +inconsistent, the associated image features are robust and can be directly +aggregated across views. This produces semantic features on the input shapes, +without requiring additional data or training. We perform extensive experiments +on multiple benchmarks (SHREC'19, SHREC'20, and TOSCA) and demonstrate that our +features, being semantic instead of geometric, produce reliable correspondence +across both isometrically and non-isometrically related shape families. + + 
+
+
+
+
+ + ☆ Space-Time Diffusion Features for Zero-Shot Text-Driven Motion Transfer + + +
+ We present a new method for text-driven motion transfer - synthesizing a +video that complies with an input text prompt describing the target objects and +scene while maintaining an input video's motion and scene layout. Prior methods +are confined to transferring motion across two subjects within the same or +closely related object categories and are applicable for limited domains (e.g., +humans). In this work, we consider a significantly more challenging setting in +which the target and source objects differ drastically in shape and +fine-grained motion characteristics (e.g., translating a jumping dog into a +dolphin). To this end, we leverage a pre-trained and fixed text-to-video +diffusion model, which provides us with generative and motion priors. The +pillar of our method is a new space-time feature loss derived directly from the +model. This loss guides the generation process to preserve the overall motion +of the input video while complying with the target object in terms of shape and +fine-grained motion traits. + +
+
+ comment: Project page: https://diffusion-motion-transfer.github.io/ +
+
+
+
+
+ + ☆ MVBench: A Comprehensive Multi-modal Video Understanding Benchmark + + +
+ With the rapid development of Multi-modal Large Language Models (MLLMs), a +number of diagnostic benchmarks have recently emerged to evaluate the +comprehension capabilities of these models. However, most benchmarks +predominantly assess spatial understanding in the static image tasks, while +overlooking temporal understanding in the dynamic video tasks. To alleviate +this issue, we introduce a comprehensive Multi-modal Video understanding +Benchmark, namely MVBench, which covers 20 challenging video tasks that cannot +be effectively solved with a single frame. Specifically, we first introduce a +novel static-to-dynamic method to define these temporal-related tasks. By +transforming various static tasks into dynamic ones, we enable the systematic +generation of video tasks that require a broad spectrum of temporal skills, +ranging from perception to cognition. Then, guided by the task definition, we +automatically convert public video annotations into multiple-choice QA to +evaluate each task. On one hand, such a distinct paradigm allows us to build +MVBench efficiently, without much manual intervention. On the other hand, it +guarantees evaluation fairness with ground-truth video annotations, avoiding +the biased scoring of LLMs. Moreover, we further develop a robust video MLLM +baseline, i.e., VideoChat2, by progressive multi-modal training with diverse +instruction-tuning data. The extensive results on our MVBench reveal that, the +existing MLLMs are far from satisfactory in temporal understanding, while our +VideoChat2 largely surpasses these leading models by over 15% on MVBench. All +models and data are available at https://github.com/OpenGVLab/Ask-Anything. + +
+
+ comment: 18 pages, 7 figures, 19 tables +
+
+
+
+
+ + ☆ Ranni: Taming Text-to-Image Diffusion for Accurate Instruction Following + + +
+ Existing text-to-image (T2I) diffusion models usually struggle in +interpreting complex prompts, especially those with quantity, object-attribute +binding, and multi-subject descriptions. In this work, we introduce a semantic +panel as the middleware in decoding texts to images, supporting the generator +to better follow instructions. The panel is obtained through arranging the +visual concepts parsed from the input text by the aid of large language models, +and then injected into the denoising network as a detailed control signal to +complement the text condition. To facilitate text-to-panel learning, we come up +with a carefully designed semantic formatting protocol, accompanied by a +fully-automatic data preparation pipeline. Thanks to such a design, our +approach, which we call Ranni, manages to enhance a pre-trained T2I generator +regarding its textual controllability. More importantly, the introduction of +the generative middleware brings a more convenient form of interaction (i.e., +directly adjusting the elements in the panel or using language instructions) +and further allows users to finely customize their generation, based on which +we develop a practical system and showcase its potential in continuous +generation and chatting-based editing. + +
+
+
+
+
+ + ☆ COLE: A Hierarchical Generation Framework for Graphic Design + + +
+ Graphic design, which has been evolving since the 15th century, plays a +crucial role in advertising. The creation of high-quality designs demands +creativity, innovation, and lateral thinking. This intricate task involves +understanding the objective, crafting visual elements such as the background, +decoration, font, color, and shape, formulating diverse professional layouts, +and adhering to fundamental visual design principles. In this paper, we +introduce COLE, a hierarchical generation framework designed to comprehensively +address these challenges. This COLE system can transform a straightforward +intention prompt into a high-quality graphic design, while also supporting +flexible editing based on user input. Examples of such input might include +directives like ``design a poster for Hisaishi's concert.'' The key insight is +to dissect the complex task of text-to-design generation into a hierarchy of +simpler sub-tasks, each addressed by specialized models working +collaboratively. The results from these models are then consolidated to produce +a cohesive final output. Our hierarchical task decomposition can streamline the +complex process and significantly enhance generation reliability. Our COLE +system consists of multiple fine-tuned Large Language Models (LLMs), Large +Multimodal Models (LMMs), and Diffusion Models (DMs), each specifically +tailored for a design-aware text or image generation task. Furthermore, we +construct the DESIGNERINTENTION benchmark to highlight the superiority of our +COLE over existing methods in generating high-quality graphic designs from user +intent. We perceive our COLE as an important step towards addressing more +complex visual design generation tasks in the future. + +
+
+ comment: Technical report. Project page: + https://graphic-design-generation.github.io/ +
+
+
+
+
+ + ☆ HumanRef: Single Image to 3D Human Generation via Reference-Guided + Diffusion + + +
+ Generating a 3D human model from a single reference image is challenging +because it requires inferring textures and geometries in invisible views while +maintaining consistency with the reference image. Previous methods utilizing 3D +generative models are limited by the availability of 3D training data. +Optimization-based methods that lift text-to-image diffusion models to 3D +generation often fail to preserve the texture details of the reference image, +resulting in inconsistent appearances in different views. In this paper, we +propose HumanRef, a 3D human generation framework from a single-view input. To +ensure the generated 3D model is photorealistic and consistent with the input +image, HumanRef introduces a novel method called reference-guided score +distillation sampling (Ref-SDS), which effectively incorporates image guidance +into the generation process. Furthermore, we introduce region-aware attention +to Ref-SDS, ensuring accurate correspondence between different body regions. +Experimental results demonstrate that HumanRef outperforms state-of-the-art +methods in generating 3D clothed humans with fine geometry, photorealistic +textures, and view-consistent appearances. + +
+
+ comment: Homepage: https://eckertzhang.github.io/HumanRef.github.io/ +
+
+
+
+
+ + ☆ UC-NeRF: Neural Radiance Field for Under-Calibrated multi-view cameras + in autonomous driving + + +
+ Multi-camera setups find widespread use across various applications, such as +autonomous driving, as they greatly expand sensing capabilities. Despite the +fast development of Neural radiance field (NeRF) techniques and their wide +applications in both indoor and outdoor scenes, applying NeRF to multi-camera +systems remains very challenging. This is primarily due to the inherent +under-calibration issues in multi-camera setup, including inconsistent imaging +effects stemming from separately calibrated image signal processing units in +diverse cameras, and system errors arising from mechanical vibrations during +driving that affect relative camera poses. In this paper, we present UC-NeRF, a +novel method tailored for novel view synthesis in under-calibrated multi-view +camera systems. Firstly, we propose a layer-based color correction to rectify +the color inconsistency in different image regions. Second, we propose virtual +warping to generate more viewpoint-diverse but color-consistent virtual views +for color correction and 3D recovery. Finally, a spatiotemporally constrained +pose refinement is designed for more robust and accurate pose calibration in +multi-camera systems. Our method not only achieves state-of-the-art performance +of novel view synthesis in multi-camera setups, but also effectively +facilitates depth estimation in large-scale outdoor scenes with the synthesized +novel views. + +
+
+ comment: See the project page for code, data: + https://kcheng1021.github.io/ucnerf.github.io +
+
+
+
+
+ + ☆ Image segmentation with traveling waves in an exactly solvable recurrent + neural network + + +
+ We study image segmentation using spatiotemporal dynamics in a recurrent +neural network where the state of each unit is given by a complex number. We +show that this network generates sophisticated spatiotemporal dynamics that can +effectively divide an image into groups according to a scene's structural +characteristics. Using an exact solution of the recurrent network's dynamics, +we present a precise description of the mechanism underlying object +segmentation in this network, providing a clear mathematical interpretation of +how the network performs this task. We then demonstrate a simple algorithm for +object segmentation that generalizes across inputs ranging from simple +geometric objects in grayscale images to natural images. Object segmentation +across all images is accomplished with one recurrent neural network that has a +single, fixed set of weights. This demonstrates the expressive potential of +recurrent neural networks when constructed using a mathematical approach that +brings together their structure, dynamics, and computation. + +
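To make the grouping mechanism concrete, here is a toy, hedged sketch of a linear recurrent network with complex-valued unit states: units that end up phase-synchronised are treated as one segment. The connectivity, normalisation, and phase-binning step below are stand-ins, not the paper's exact construction.

```python
import numpy as np


def complex_rnn_segment(W, x0, steps=100, n_groups=2):
    """Roll out z_{t+1} = W z_t with complex states and group units by final phase."""
    z = x0.astype(np.complex128)
    for _ in range(steps):
        z = W @ z
        z = z / (np.abs(z).max() + 1e-12)  # keep amplitudes bounded
    phases = np.angle(z)
    # crude grouping: quantise phases into n_groups bins
    bins = np.linspace(-np.pi, np.pi, n_groups + 1)
    return np.digitize(phases, bins[1:-1])


# Random weights only demonstrate the call signature; structured connectivity
# (reflecting image adjacency) is what would produce meaningful traveling waves.
rng = np.random.default_rng(0)
W = rng.standard_normal((64, 64)) + 1j * rng.standard_normal((64, 64))
labels = complex_rnn_segment(W, rng.standard_normal(64))
```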
+
+
+
+
+ + ☆ Debiasing Multimodal Models via Causal Information Minimization EMNLP 2023 + + +
+ Most existing debiasing methods for multimodal models, including causal +intervention and inference methods, utilize approximate heuristics to represent +the biases, such as shallow features from early stages of training or unimodal +features for multimodal tasks like VQA, etc., which may not be accurate. In +this paper, we study bias arising from confounders in a causal graph for +multimodal data and examine a novel approach that leverages causally-motivated +information minimization to learn the confounder representations. Robust +predictive features contain diverse information that helps a model generalize +to out-of-distribution data. Hence, minimizing the information content of +features obtained from a pretrained biased model helps learn the simplest +predictive features that capture the underlying data distribution. We treat +these features as confounder representations and use them via methods motivated +by causal theory to remove bias from models. We find that the learned +confounder representations indeed capture dataset biases, and the proposed +debiasing methods improve out-of-distribution (OOD) performance on multiple +multimodal datasets without sacrificing in-distribution performance. +Additionally, we introduce a novel metric to quantify the sufficiency of +spurious features in models' predictions that further demonstrates the +effectiveness of our proposed methods. Our code is available at: +https://github.com/Vaidehi99/CausalInfoMin + +
+
+ comment: EMNLP 2023 Findings (16 pages) +
+
+
+
+
+ + ☆ The Sky's the Limit: Re-lightable Outdoor Scenes via a Sky-pixel + Constrained Illumination Prior and Outside-In Visibility + + +
+ Inverse rendering of outdoor scenes from unconstrained image collections is a +challenging task, particularly due to illumination/albedo ambiguities and occlusion of +the illumination environment (shadowing) caused by geometry. However, there are +many cues in an image that can aid in the disentanglement of geometry, albedo +and shadows. We exploit the fact that any sky pixel provides a direct +measurement of distant lighting in the corresponding direction and, via a +neural illumination prior, a statistical cue as to the remaining illumination +environment. We also introduce a novel `outside-in' method for computing +differentiable sky visibility based on a neural directional distance function. +This is efficient and can be trained in parallel with the neural scene +representation, allowing gradients from the appearance loss to flow from shadows to +influence estimation of illumination and geometry. Our method estimates +high-quality albedo, geometry, illumination and sky visibility, achieving +state-of-the-art results on the NeRF-OSR relighting benchmark. Our code and +models can be found at https://github.com/JADGardner/neusky + +
+
+
+
+
+ + ☆ SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models + + +
+ The development of text-to-video (T2V), i.e., generating videos with a given +text prompt, has been significantly advanced in recent years. However, relying +solely on text prompts often results in ambiguous frame composition due to +spatial uncertainty. The research community thus leverages the dense structure +signals, e.g., per-frame depth/edge sequences, to enhance controllability, +whose collection accordingly increases the burden of inference. In this work, +we present SparseCtrl to enable flexible structure control with temporally +sparse signals, requiring only one or a few inputs, as shown in Figure 1. It +incorporates an additional condition encoder to process these sparse signals +while leaving the pre-trained T2V model untouched. The proposed approach is +compatible with various modalities, including sketches, depth maps, and RGB +images, providing more practical control for video generation and promoting +applications such as storyboarding, depth rendering, keyframe animation, and +interpolation. Extensive experiments demonstrate the generalization of +SparseCtrl on both original and personalized T2V generators. Codes and models +will be publicly available at https://guoyww.github.io/projects/SparseCtrl . + +
+
+ comment: Project page: https://guoyww.github.io/projects/SparseCtrl +
+
+
+
+
+ + ☆ LLaFS: When Large-Language Models Meet Few-Shot Segmentation + + +
+ This paper proposes LLaFS, the first attempt to leverage large language +models (LLMs) in few-shot segmentation. In contrast to the conventional +few-shot segmentation methods that only rely on the limited and biased +information from the annotated support images, LLaFS leverages the vast prior +knowledge gained by LLM as an effective supplement and directly uses the LLM to +segment images in a few-shot manner. To enable the text-based LLM to handle +image-related tasks, we carefully design an input instruction that allows the +LLM to produce segmentation results represented as polygons, and propose a +region-attribute table to simulate the human visual mechanism and provide +multi-modal guidance. We also synthesize pseudo samples and use curriculum +learning for pretraining to augment data and achieve better optimization. LLaFS +achieves state-of-the-art results on multiple datasets, showing the potential +of using LLMs for few-shot computer vision tasks. Code will be available at +https://github.com/lanyunzhu99/LLaFS. + +
+
+
+
+
+ + ☆ Super-Resolution through StyleGAN Regularized Latent Search: A + Realism-Fidelity Trade-off + + +
+ This paper addresses the problem of super-resolution: constructing a +high-resolution (HR) image from a low-resolution (LR) one. Recent unsupervised +approaches search the latent space of a StyleGAN pre-trained on HR images for +the image that best downscales to the input LR image. However, they tend to +produce out-of-domain images and fail to accurately reconstruct HR images that +are far from the original domain. Our contribution is twofold. Firstly, we +introduce a new regularizer to constrain the search in the latent space, +ensuring that the inverted code lies in the original image manifold. Secondly, +we further enhance the reconstruction by expanding the image prior around +the optimal latent code. Our results show that the proposed approach recovers +realistic high-quality images for large magnification factors. Furthermore, for +low magnification factors, it can still reconstruct details that the generator +could not have produced otherwise. Altogether, our approach achieves a good +trade-off between fidelity and realism for the super-resolution task. + +
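The latent-search formulation can be summarised as minimising a downscaling-consistency loss plus a regulariser that keeps the code on the image manifold. The sketch below is a generic version of that loop; the generator interface, the choice of an average-latent penalty as the regulariser, and all hyperparameters are assumptions rather than the paper's exact formulation.

```python
import torch
import torch.nn.functional as F


def regularized_latent_search(generator, lr_image, w_init, w_mean,
                              scale_factor=8, lam=0.1, steps=300, lr=0.05):
    """Find a latent code whose generated HR image downscales to lr_image."""
    w = w_init.clone().requires_grad_(True)
    opt = torch.optim.Adam([w], lr=lr)
    for _ in range(steps):
        hr = generator(w)  # assumed: latent -> NCHW image tensor
        down = F.interpolate(hr, scale_factor=1.0 / scale_factor,
                             mode="bicubic", align_corners=False)
        fidelity = F.mse_loss(down, lr_image)      # match the LR observation
        reg = (w - w_mean).pow(2).mean()           # stay near the latent manifold
        loss = fidelity + lam * reg
        opt.zero_grad()
        loss.backward()
        opt.step()
    return w.detach()
```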
+
+
+
+
+ + ☆ Mitigating Object Hallucinations in Large Vision-Language Models through + Visual Contrastive Decoding + + +
+ Large Vision-Language Models (LVLMs) have advanced considerably, intertwining +visual recognition and language understanding to generate content that is not +only coherent but also contextually attuned. Despite their success, LVLMs still +suffer from the issue of object hallucinations, where models generate plausible +yet incorrect outputs that include objects that do not exist in the images. To +mitigate this issue, we introduce Visual Contrastive Decoding (VCD), a simple +and training-free method that contrasts output distributions derived from +original and distorted visual inputs. The proposed VCD effectively reduces the +over-reliance on statistical bias and unimodal priors, two essential causes of +object hallucinations. This adjustment ensures the generated content is closely +grounded to visual inputs, resulting in contextually accurate outputs. Our +experiments show that VCD, without either additional training or the usage of +external tools, significantly mitigates the object hallucination issue across +different LVLM families. Beyond mitigating object hallucinations, VCD also +excels in general LVLM benchmarks, highlighting its wide-ranging applicability. + +
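The core of the decoding rule is simple: contrast the next-token logits obtained with the original image against those obtained with a distorted (e.g. noised) image, then sample from the adjusted distribution. The snippet below sketches that step; the amplification factor `alpha` and the plausibility cutoff `beta` are illustrative defaults, not values taken from the paper.

```python
import torch


def contrastive_decode_logits(logits_original, logits_distorted, alpha=1.0, beta=0.1):
    """Adjust next-token logits by contrasting original vs. distorted visual inputs."""
    adjusted = (1.0 + alpha) * logits_original - alpha * logits_distorted
    # Restrict sampling to tokens that are at least somewhat plausible
    # under the original visual input.
    probs = torch.softmax(logits_original, dim=-1)
    cutoff = beta * probs.max(dim=-1, keepdim=True).values
    return adjusted.masked_fill(probs < cutoff, float("-inf"))
```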
+
+
+
+
+ + ☆ RichDreamer: A Generalizable Normal-Depth Diffusion Model for Detail + Richness in Text-to-3D + + +
+ Lifting 2D diffusion for 3D generation is a challenging problem due to the +lack of geometric prior and the complex entanglement of materials and lighting +in natural images. Existing methods have shown promise by first creating the +geometry through score-distillation sampling (SDS) applied to rendered surface +normals, followed by appearance modeling. However, relying on a 2D RGB +diffusion model to optimize surface normals is suboptimal due to the +distribution discrepancy between natural images and normals maps, leading to +instability in optimization. In this paper, recognizing that the normal and +depth information effectively describe scene geometry and be automatically +estimated from images, we propose to learn a generalizable Normal-Depth +diffusion model for 3D generation. We achieve this by training on the +large-scale LAION dataset together with the generalizable image-to-depth and +normal prior models. In an attempt to alleviate the mixed illumination effects +in the generated materials, we introduce an albedo diffusion model to impose +data-driven constraints on the albedo component. Our experiments show that when +integrated into existing text-to-3D pipelines, our models significantly enhance +the detail richness, achieving state-of-the-art results. Our project page is +https://lingtengqiu.github.io/RichDreamer/. + +
+
+ comment: Project Page: https://lingtengqiu.github.io/RichDreamer/ +
+
+
+
+
+ + ☆ UGG: Unified Generative Grasping + + +
+ Dexterous grasping aims to produce diverse grasping postures with a high +grasping success rate. Regression-based methods that directly predict grasping +parameters given the object may achieve a high success rate but often lack +diversity. Generation-based methods that generate grasping postures conditioned +on the object can often produce diverse grasping, but they are insufficient for +high grasping success due to lack of discriminative information. To mitigate, +we introduce a unified diffusion-based dexterous grasp generation model, dubbed +the name UGG, which operates within the object point cloud and hand parameter +spaces. Our all-transformer architecture unifies the information from the +object, the hand, and the contacts, introducing a novel representation of +contact points for improved contact modeling. The flexibility and quality of +our model enable the integration of a lightweight discriminator, benefiting +from simulated discriminative data, which pushes for a high success rate while +preserving high diversity. Beyond grasp generation, our model can also generate +objects based on hand information, offering valuable insights into object +design and studying how the generative model perceives objects. Our model +achieves state-of-the-art dexterous grasping on the large-scale DexGraspNet +dataset while facilitating human-centric object design, marking a significant +advancement in dexterous grasping research. Our project page is +https://jiaxin-lu.github.io/ugg/ . + +
+
+ comment: 17 pages, 14 figures +
+
+
+
+
+ + ☆ Brain-ID: Learning Robust Feature Representations for Brain Imaging + + +
+ Recent learning-based approaches have made astonishing advances in calibrated +medical imaging like computerized tomography, yet they struggle to generalize +in uncalibrated modalities -- notoriously magnetic resonance imaging (MRI), +where performance is highly sensitive to the differences in MR contrast, +resolution, and orientation between the training and testing data. This +prevents broad applicability to the diverse clinical acquisition protocols in +the real world. We introduce Brain-ID, a robust feature representation learning +strategy for brain imaging, which is contrast-agnostic, and robust to the brain +anatomy of each subject regardless of the appearance of acquired images (i.e., +deformation, contrast, resolution, orientation, artifacts, etc). Brain-ID is +trained entirely on synthetic data, and easily adapts to downstream tasks with +our proposed simple one-layer solution. We validate the robustness of Brain-ID +features, and evaluate their performance in a variety of downstream +applications, including both contrast-independent (anatomy +reconstruction/contrast synthesis, brain segmentation), and contrast-dependent +(super-resolution, bias field estimation) tasks. Extensive experiments on 6 +public datasets demonstrate that Brain-ID achieves state-of-the-art performance +in all tasks, and more importantly, preserves its performance when only limited +training data is available. + +
+
+ comment: 16 pages, 10 figures +
+
+
+
+
+ + ☆ Lane-Keeping Control of Autonomous Vehicles Through a Soft-Constrained + Iterative LQR + + +
+ The accurate prediction of smooth steering inputs is crucial for autonomous +vehicle applications because control actions with jitter might cause the +vehicle system to become unstable. To address this problem in automobile +lane-keeping control without the use of additional smoothing algorithms, we +developed a soft-constrained iterative linear-quadratic regulator (soft-CILQR) +algorithm by integrating the CILQR algorithm with a model predictive control (MPC) +constraint relaxation method. We incorporated slack variables into the state +and control barrier functions of the soft-CILQR solver to soften the +constraints in the optimization process so that stabilizing control inputs can +be calculated in a relatively simple manner. Two types of automotive +lane-keeping experiments were conducted with a linear system dynamics model to +test the performance of the proposed soft-CILQR algorithm and to compare its +performance with that of the CILQR algorithm: numerical simulations and +experiments involving challenging vision-based maneuvers. In the numerical +simulations, the soft-CILQR and CILQR solvers managed to drive the system +toward the reference state asymptotically; however, the soft-CILQR solver +obtained smooth steering input trajectories more easily than did the CILQR +solver under conditions involving additive disturbances. In the experiments +with visual inputs, the soft-CILQR controller outperformed the CILQR controller +in terms of tracking accuracy and steering smoothness while driving an +ego vehicle in TORCS. + +
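The constraint softening amounts to replacing each hard barrier with a relaxed term that a slack variable is allowed to violate, at a quadratic cost. The snippet below is a minimal sketch of such a relaxed barrier for a constraint g(x) <= 0; the exponential-barrier form and the weights are assumptions modelled on common CILQR/MPC practice, not the paper's exact cost.

```python
import numpy as np


def soft_barrier_cost(g_value, slack, q1=1.0, q2=5.0, w_slack=10.0):
    """Relaxed barrier for g(x) <= 0 with a non-negative slack variable."""
    slack = max(slack, 0.0)
    barrier = q1 * np.exp(q2 * (g_value - slack))  # soft penalty on the relaxed constraint
    penalty = 0.5 * w_slack * slack ** 2           # discourage large relaxations
    return barrier + penalty
```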
+
+ comment: 11 figures, 10 pages +
+
+
+
+
+ + ☆ Dendrogram distance: an evaluation metric for generative networks using + hierarchical clustering + + +
+ We present a novel metric for generative modeling evaluation, focusing +primarily on generative networks. The method uses dendrograms to represent real +and fake data, allowing for the divergence between training and generated +samples to be computed. This metric focuses on mode collapse, targeting +generators that are not able to capture all modes in the training set. To +evaluate the proposed method, we introduce a validation scheme based on +sampling from real datasets; the metric is therefore evaluated in a controlled +environment and proves to be competitive with other state-of-the-art +approaches. + +
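As a concrete illustration of comparing dendrograms built from real and generated samples, the snippet below computes a simplified divergence from the sorted merge heights of two hierarchical clusterings; the linkage method and the height-difference summary are placeholders, not the paper's exact definition.

```python
import numpy as np
from scipy.cluster.hierarchy import linkage


def dendrogram_distance(real_samples, fake_samples, method="ward"):
    """Compare the hierarchical-clustering structure of real vs. generated data."""
    h_real = np.sort(linkage(real_samples, method=method)[:, 2])  # merge heights
    h_fake = np.sort(linkage(fake_samples, method=method)[:, 2])
    n = min(len(h_real), len(h_fake))
    return float(np.abs(h_real[:n] - h_fake[:n]).mean())
```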
+
+
+
+
+ + ☆ Optimisation-Based Multi-Modal Semantic Image Editing + + +
+ Image editing affords increased control over the aesthetics and content of +generated images. Pre-existing works focus predominantly on text-based +instructions to achieve desired image modifications, which limits edit precision +and accuracy. In this work, we propose an inference-time editing optimisation, +designed to extend beyond textual edits to accommodate multiple editing +instruction types (e.g. spatial layout-based; pose, scribbles, edge maps). We +propose to disentangle the editing task into two competing subtasks: successful +local image modifications and global content consistency preservation, where +the subtasks are guided by two dedicated loss functions. By allowing the influence +of each loss function to be adjusted, we build a flexible editing solution that +can be tailored to user preferences. We evaluate our method using text, pose +and scribble edit conditions, and highlight our ability to achieve complex +edits through both qualitative and quantitative experiments. + +
+
+
+
+
+ + ☆ A Unified Approach for Text- and Image-guided 4D Scene Generation + + +
+ Large-scale diffusion generative models are greatly simplifying image, video +and 3D asset creation from user-provided text prompts and images. However, the +challenging problem of text-to-4D dynamic 3D scene generation with diffusion +guidance remains largely unexplored. We propose Dream-in-4D, which features a +novel two-stage approach for text-to-4D synthesis, leveraging (1) 3D and 2D +diffusion guidance to effectively learn a high-quality static 3D asset in the +first stage; (2) a deformable neural radiance field that explicitly +disentangles the learned static asset from its deformation, preserving quality +during motion learning; and (3) a multi-resolution feature grid for the +deformation field with a displacement total variation loss to effectively learn +motion with video diffusion guidance in the second stage. Through a user +preference study, we demonstrate that our approach significantly advances image +and motion quality, 3D consistency and text fidelity for text-to-4D generation +compared to baseline approaches. Thanks to its motion-disentangled +representation, Dream-in-4D can also be easily adapted for controllable +generation where appearance is defined by one or multiple images, without the +need to modify the motion learning stage. Thus, our method offers, for the +first time, a unified approach for text-to-4D, image-to-4D and personalized 4D +generation tasks. + +
+
+ comment: Project page: https://dream-in-4d.github.io/dream-in-4D/ +
+
+
+
+
+ + ☆ Wavelet-based Fourier Information Interaction with Frequency Diffusion + Adjustment for Underwater Image Restoration + + +
+ Underwater images are subject to intricate and diverse degradation, +inevitably affecting the effectiveness of underwater visual tasks. However, +most approaches primarily operate in the raw pixel space of images, which +limits the exploration of the frequency characteristics of underwater images, +leading to an inadequate utilization of deep models' representational +capabilities in producing high-quality images. In this paper, we introduce a +novel Underwater Image Enhancement (UIE) framework, named WF-Diff, designed to +fully leverage the characteristics of frequency domain information and +diffusion models. WF-Diff consists of two detachable networks: Wavelet-based +Fourier information interaction network (WFI2-net) and Frequency Residual +Diffusion Adjustment Module (FRDAM). With our full exploration of the frequency +domain information, WFI2-net aims to achieve preliminary enhancement of +frequency information in the wavelet space. Our proposed FRDAM can further +refine the high- and low-frequency information of the initial enhanced images, +which can be viewed as a plug-and-play universal module to adjust the detail of +the underwater images. With the above techniques, our algorithm can show SOTA +performance on real-world underwater image datasets, and achieves competitive +performance in visual quality. + +
+
+
+
+
+ + ☆ Self-training solutions for the ICCV 2023 GeoNet Challenge ICCV-2023 + + +
+ GeoNet is a recently proposed domain adaptation benchmark consisting of three +challenges (i.e., GeoUniDA, GeoImNet, and GeoPlaces). Each challenge contains +images collected from the USA and Asia where there are huge geographical gaps. +Our solution adopts a two-stage source-free domain adaptation framework with a +Swin Transformer backbone to achieve knowledge transfer from the USA (source) +domain to Asia (target) domain. In the first stage, we train a source model +using labeled source data with a re-sampling strategy and two types of +cross-entropy loss. In the second stage, we generate pseudo labels for +unlabeled target data to fine-tune the model. Our method achieves an H-score of +74.56% and ultimately ranks 1st in the GeoUniDA challenge. In GeoImNet and +GeoPlaces challenges, our solution also reaches a top-3 accuracy of 64.46% and +51.23%, respectively. + +
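Stage two of the described pipeline is standard confidence-thresholded pseudo-labelling. The sketch below shows that step in isolation; the threshold value and the loader interface are assumptions, since the report does not spell them out.

```python
import torch


@torch.no_grad()
def collect_pseudo_labels(model, target_loader, threshold=0.9, device="cuda"):
    """Keep target samples whose predicted class probability exceeds the threshold."""
    model.eval()
    images_kept, labels_kept = [], []
    for images in target_loader:  # loader assumed to yield unlabeled image batches
        probs = torch.softmax(model(images.to(device)), dim=1)
        confidence, prediction = probs.max(dim=1)
        mask = (confidence >= threshold).cpu()
        images_kept.append(images[mask])
        labels_kept.append(prediction.cpu()[mask])
    return torch.cat(images_kept), torch.cat(labels_kept)
```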
+
+ comment: technical report; 1st in the ICCV-2023 GeoUniDA challenge +
+
+
+
+
+ + ☆ Beyond Hallucinations: Enhancing LVLMs through Hallucination-Aware + Direct Preference Optimization + + +
+ Multimodal large language models have made significant advancements in recent +years, yet they still suffer from a common issue known as the "hallucination +problem" where the models generate textual descriptions that contain inaccurate +or non-existent content from the image. To address this issue, this paper +introduces a novel strategy: Hallucination-Aware Direct Preference Optimization +(HA-DPO). Our approach treats the hallucination problem as a unique preference +selection issue, where the model is trained to favor the non-hallucinating +response when presented with two responses of the same image (one accurate and +one hallucinating). This paper also presents an efficient process for +constructing hallucination sample pairs to ensure high-quality, +style-consistent pairs for stable HA-DPO training. We applied this strategy to +two mainstream multimodal models, and the results showed a significant +reduction in the hallucination problem and an enhancement in the models' +generalization capabilities. With HA-DPO, the MiniGPT-4 model demonstrates +significant advancements: POPE accuracy increases from 51.13% to 85.66% (34.5% +absolute improvement), and the MME score escalates from 968.58 to 1365.76 (41% +relative improvement). The code, models, and datasets will be made publicly +available. + +
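HA-DPO casts hallucination mitigation as preference optimisation over (non-hallucinating, hallucinating) response pairs. The snippet below is the standard DPO objective that such training typically builds on; treating it as the exact HA-DPO loss, and the value of beta, are assumptions.

```python
import torch.nn.functional as F


def dpo_loss(policy_chosen_logps, policy_rejected_logps,
             ref_chosen_logps, ref_rejected_logps, beta=0.1):
    """Direct Preference Optimization loss over summed response log-probabilities.

    'chosen' = the accurate (non-hallucinating) response,
    'rejected' = the hallucinating response for the same image.
    """
    chosen_reward = beta * (policy_chosen_logps - ref_chosen_logps)
    rejected_reward = beta * (policy_rejected_logps - ref_rejected_logps)
    return -F.logsigmoid(chosen_reward - rejected_reward).mean()
```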
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Unified-modal Salient Object Detection via Adaptive Prompt Learning + + +
+ Existing single-modal and multi-modal salient object detection (SOD) methods +focus on designing specific architectures tailored for their respective tasks. +However, developing completely different models for different tasks is labor- and +time-consuming, and incurs high computational and practical +deployment costs. In this paper, we make the first attempt to address both +single-modal and multi-modal SOD in a unified framework called UniSOD. +Nevertheless, assigning appropriate strategies to modality-variable inputs is +challenging. To this end, UniSOD learns modality-aware prompts with +task-specific hints through adaptive prompt learning, which are plugged into +the proposed pre-trained baseline SOD model to handle corresponding tasks, +while requiring only a few learnable parameters compared to training the entire +model. Each modality-aware prompt is generated from a switchable prompt +generation block, which performs structural switching based solely on the +single-modal or multi-modal inputs. UniSOD achieves consistent performance +improvement on 14 benchmark datasets for RGB, RGB-D, and RGB-T SOD, which +demonstrates that our method effectively and efficiently unifies single-modal +and multi-modal SOD tasks. + +
+
+ comment: 16 pages, 10 figures +
+
+
+
+
+ + ☆ 1-Lipschitz Layers Compared: Memory, Speed, and Certifiable Robustness + + +
+ The robustness of neural networks against input perturbations with bounded +magnitude represents a serious concern in the deployment of deep learning +models in safety-critical systems. Recently, the scientific community has +focused on enhancing certifiable robustness guarantees by crafting 1-Lipschitz +neural networks that leverage Lipschitz bounded dense and convolutional layers. +Although different methods have been proposed in the literature to achieve this +goal, understanding the performance of such methods is not straightforward, +since different metrics can be relevant (e.g., training time, memory usage, +accuracy, certifiable robustness) for different applications. For this reason, +this work provides a thorough theoretical and empirical comparison between +methods by evaluating them in terms of memory usage, speed, and certifiable +robust accuracy. The paper also provides some guidelines and recommendations to +support the user in selecting the methods that work best depending on the +available resources. We provide code at +https://github.com/berndprach/1LipschitzLayersCompared. + +
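One of the simplest constructions in this family is a dense layer whose weight matrix is rescaled by its spectral norm, which makes the layer 1-Lipschitz by construction. The sketch below illustrates that idea; it is a generic example, not one of the specific methods benchmarked in the paper.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class SpectralNormLinear(nn.Module):
    """Dense layer made 1-Lipschitz by dividing the weight by its largest singular value."""

    def __init__(self, in_features, out_features):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_features, in_features) * 0.05)
        self.bias = nn.Parameter(torch.zeros(out_features))

    def forward(self, x):
        sigma = torch.linalg.matrix_norm(self.weight, ord=2)  # spectral norm
        return F.linear(x, self.weight / sigma, self.bias)
```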
+
+
+
+
+ + ☆ Decomposer: Semi-supervised Learning of Image Restoration and Image + Decomposition + + +
+ We present Decomposer, a semi-supervised reconstruction model that decomposes +distorted image sequences into their fundamental building blocks - the original +image and the applied augmentations, i.e., shadow, light, and occlusions. To +solve this problem, we use the SIDAR dataset that provides a large number of +distorted image sequences: each sequence contains images with shadows, +lighting, and occlusions applied to an undistorted version. Each distortion +changes the original signal in different ways, e.g., additive or multiplicative +noise. We propose a transformer-based model to explicitly learn this +decomposition. The sequential model uses 3D Swin-Transformers for +spatio-temporal encoding and 3D U-Nets as prediction heads for individual parts +of the decomposition. We demonstrate that by separately pre-training our model +on weakly supervised pseudo labels, we can steer our model to optimize for our +ambiguous problem definition and learn to differentiate between the different +image distortions. + +
+
+
+
+
+ + ☆ SARA: Controllable Makeup Transfer with Spatial Alignment and + Region-Adaptive Normalization + + +
+ Makeup transfer is a process of transferring the makeup style from a +reference image to the source images, while preserving the source images' +identities. This technique is highly desirable and finds many applications. +However, existing methods lack fine-level control of the makeup style, making +it challenging to achieve high-quality results when dealing with large spatial +misalignments. To address this problem, we propose a novel Spatial Alignment +and Region-Adaptive normalization method (SARA) in this paper. Our method +generates detailed makeup transfer results that can handle large spatial +misalignments and achieve part-specific and shade-controllable makeup transfer. +Specifically, SARA comprises three modules: Firstly, a spatial alignment module +that preserves the spatial context of makeup and provides a target semantic map +for guiding the shape-independent style codes. Secondly, a region-adaptive +normalization module that decouples shape and makeup style using per-region +encoding and normalization, which facilitates the elimination of spatial +misalignments. Lastly, a makeup fusion module blends identity features and +makeup style by injecting learned scale and bias parameters. Experimental +results show that our SARA method outperforms existing methods and achieves +state-of-the-art performance on two public datasets. + +
+
+
+
+
+ + ☆ Denoising Diffusion Probabilistic Models for Image Inpainting of Cell + Distributions in the Human Brain + + +
+ Recent advances in imaging and high-performance computing have made it +possible to image the entire human brain at the cellular level. This is the +basis for studying the multi-scale architecture of the brain regarding its +subdivision into brain areas and nuclei, cortical layers, columns, and cell +clusters down to single-cell morphology. Methods for brain mapping and cell +segmentation exploit such images to enable rapid and automated analysis of +cytoarchitecture and cell distribution in complete series of histological +sections. However, the presence of inevitable processing artifacts in the image +data caused by missing sections, tears in the tissue, or staining variations +remains the primary reason for gaps in the resulting image data. To this end, we +aim to provide a model that can fill in missing information in a reliable way, +following the true cell distribution at different scales. Inspired by the +recent success in image generation, we propose a denoising diffusion +probabilistic model (DDPM), trained on light-microscopic scans of cell-body +stained sections. We extend this model with the RePaint method to impute +missing or replace corrupted image data. We show that our trained DDPM is able +to generate highly realistic image information for this purpose, generating +plausible cell statistics and cytoarchitectonic patterns. We validate its +outputs using two established downstream task models trained on the same data. + +
+
+ comment: Submitted to ISBI-2024 +
+
+
+
+
+ + ☆ DI-Net : Decomposed Implicit Garment Transfer Network for Digital + Clothed 3D Human + + +
+ 3D virtual try-on enjoys many potential applications and hence has attracted +wide attention. However, it remains a challenging task that has not been +adequately solved. Existing 2D virtual try-on methods cannot be directly +extended to 3D since they lack the ability to perceive the depth of each pixel. +Besides, 3D virtual try-on approaches are mostly built on a fixed topological +structure and with heavy computation. To deal with these problems, we propose a +Decomposed Implicit garment transfer network (DI-Net), which can effortlessly +reconstruct a 3D human mesh with the new try-on result and preserve the +texture from an arbitrary perspective. Specifically, DI-Net consists of two +modules: 1) A complementary warping module that warps the reference image to +have the same pose as the source image through dense correspondence learning +and sparse flow learning; 2) A geometry-aware decomposed transfer module that +decomposes the garment transfer into image layout based transfer and texture +based transfer, achieving surface and texture reconstruction by constructing +pixel-aligned implicit functions. Experimental results show the effectiveness +and superiority of our method in the 3D virtual try-on task, which can yield +more high-quality results over other existing methods. + +
+
+
+
+
+ + ☆ Panacea: Panoramic and Controllable Video Generation for Autonomous + Driving + + +
+ The field of autonomous driving increasingly demands high-quality annotated +training data. In this paper, we propose Panacea, an innovative approach to +generate panoramic and controllable videos in driving scenarios, capable of +yielding an unlimited number of diverse, annotated samples pivotal for +autonomous driving advancements. Panacea addresses two critical challenges: +'Consistency' and 'Controllability.' Consistency ensures temporal and +cross-view coherence, while Controllability ensures the alignment of generated +content with corresponding annotations. Our approach integrates a novel 4D +attention mechanism and a two-stage generation pipeline to maintain coherence, +supplemented by the ControlNet framework for meticulous control via +Bird's-Eye-View (BEV) layouts. Extensive qualitative and quantitative +evaluations of Panacea on the nuScenes dataset prove its effectiveness in +generating high-quality multi-view driving-scene videos. This work notably +propels the field of autonomous driving by effectively augmenting the training +dataset used for advanced BEV perception techniques. + +
+
+ comment: Project page: https://panacea-ad.github.io/ +
+
+
+
+
+ + ☆ The curse of language biases in remote sensing VQA: the role of spatial + attributes, language diversity, and the need for clear evaluation + + +
+ Remote sensing visual question answering (RSVQA) opens new opportunities for +the use of overhead imagery by the general public, by enabling human-machine +interaction with natural language. Building on the recent advances in natural +language processing and computer vision, the goal of RSVQA is to answer a +question formulated in natural language about a remote sensing image. Language +understanding is essential to the success of the task, but has not yet been +thoroughly examined in RSVQA. In particular, the problem of language biases is +often overlooked in the remote sensing community, which can impact model +robustness and lead to wrong conclusions about the performances of the model. +Thus, the present work aims at highlighting the problem of language biases in +RSVQA with a threefold analysis strategy: visual blind models, adversarial +testing and dataset analysis. This analysis focuses both on model and data. +Moreover, we motivate the use of more informative and complementary evaluation +metrics sensitive to the issue. The gravity of language biases in RSVQA is then +exposed for all of these methods with the training of models discarding the +image data and the manipulation of the visual input during inference. Finally, +a detailed analysis of question-answer distribution demonstrates the root of +the problem in the data itself. Thanks to this analytical study, we observed +that biases in remote sensing are more severe than in standard VQA, likely due +to the specifics of existing remote sensing datasets for the task, e.g. +geographical similarities and sparsity, as well as a simpler vocabulary and +question generation strategies. While new, improved and less-biased datasets +appear as a necessity for the development of the promising field of RSVQA, we +demonstrate that more informed, relative evaluation metrics remain much needed +to transparently communicate results of future RSVQA methods. + +
+
+
+
+
+ + ☆ Multi-Channel Cross Modal Detection of Synthetic Face Images + + +
+ Synthetically generated face images have been shown to be indistinguishable from +real images by humans and as such can lead to a lack of trust in digital +content as they can, for instance, be used to spread misinformation. Therefore, +the need to develop algorithms for detecting entirely synthetic face images is +apparent. Of interest are images generated by state-of-the-art deep +learning-based models, as these exhibit a high level of visual realism. Recent +works have demonstrated that detecting such synthetic face images under +realistic circumstances remains difficult as new and improved generative models +are proposed at a rapid pace and arbitrary image post-processing can be +applied. In this work, we propose a multi-channel architecture for detecting +entirely synthetic face images which analyses information both in the frequency +and visible spectra using Cross Modal Focal Loss. We compare the proposed +architecture with several related architectures trained using Binary Cross +Entropy and show in cross-model experiments that the proposed architecture +supervised using Cross Modal Focal Loss, in general, achieves the most competitive +performance. + +
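For reference, the plain binary focal loss that the cross-modal variant builds on is shown below; the cross-modal version additionally modulates each channel's term using the other channel's confidence, which is omitted here. Hyperparameters are the usual defaults, not values from the paper.

```python
import torch
import torch.nn.functional as F


def binary_focal_loss(logits, targets, gamma=2.0, alpha=0.25):
    """Standard binary focal loss; down-weights easy examples (targets are floats in {0, 1})."""
    ce = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
    p = torch.sigmoid(logits)
    p_t = p * targets + (1 - p) * (1 - targets)              # probability of the true class
    alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
    return (alpha_t * (1.0 - p_t) ** gamma * ce).mean()
```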
+
+
+
+
+ + ☆ Rescuing referral failures during automated diagnosis of domain-shifted + medical images + + +
+ The success of deep learning models deployed in the real world depends +critically on their ability to generalize well across diverse data domains. +Here, we address a fundamental challenge with selective classification during +automated diagnosis with domain-shifted medical images. In this scenario, +models must learn to avoid making predictions when label confidence is low, +especially when tested with samples far removed from the training set +(covariate shift). Such uncertain cases are typically referred to the clinician +for further analysis and evaluation. Yet, we show that even state-of-the-art +domain generalization approaches fail severely during referral when tested on +medical images acquired from a different demographic or using a different +technology. We examine two benchmark diagnostic medical imaging datasets +exhibiting strong covariate shifts: i) diabetic retinopathy prediction with +retinal fundus images and ii) multilabel disease prediction with chest X-ray +images. We show that predictive uncertainty estimates do not generalize well +under covariate shifts leading to non-monotonic referral curves, and severe +drops in performance (up to 50%) at high referral rates (>70%). We evaluate +novel combinations of robust generalization and post hoc referral approaches, +that rescue these failures and achieve significant performance improvements, +typically >10%, over baseline methods. Our study identifies a critical +challenge with referral in domain-shifted medical images and finds key +applications in reliable, automated disease diagnosis. + +
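The referral evaluation described above retains only the most confident predictions and refers the rest to a clinician. A minimal sketch of how such a referral curve is computed follows; the confidence measure itself (softmax probability, ensemble variance, etc.) is whatever the upstream model provides.

```python
import numpy as np


def referral_curve(confidences, is_correct, referral_rates):
    """Accuracy on retained samples after referring the least-confident fraction."""
    order = np.argsort(confidences)  # least confident first
    correct_sorted = np.asarray(is_correct, dtype=float)[order]
    accuracies = []
    for rate in referral_rates:
        n_referred = int(rate * len(correct_sorted))
        kept = correct_sorted[n_referred:]
        accuracies.append(kept.mean() if kept.size else np.nan)
    return np.asarray(accuracies)


# e.g. accuracy at 0%, 50% and 70% referral:
# referral_curve(conf, correct, [0.0, 0.5, 0.7])
```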
+
+
+
+
+ + ☆ Gradient-based Local Next-best-view Planning for Improved Perception of + Targeted Plant Nodes + + +
+ Robots are increasingly used in tomato greenhouses to automate +labour-intensive tasks such as selective harvesting and de-leafing. To perform +these tasks, robots must be able to accurately and efficiently perceive the +plant nodes that need to be cut, despite the high levels of occlusion from +other plant parts. We formulate this problem as a local next-best-view (NBV) +planning task where the robot has to plan an efficient set of camera viewpoints +to overcome occlusion and improve the quality of perception. Our formulation +focuses on quickly improving the perception accuracy of a single target node to +maximise its chances of being cut. Previous methods of NBV planning mostly +focused on global view planning and used random sampling of candidate +viewpoints for exploration, which could suffer from high computational costs, +ineffective view selection due to poor candidates, or non-smooth trajectories +due to inefficient sampling. We propose a gradient-based NBV planner using +differential ray sampling, which directly estimates the local gradient +direction for viewpoint planning to overcome occlusion and improve perception. +Through simulation experiments, we showed that our planner can handle +occlusions and improve the 3D reconstruction and position estimation of nodes +equally well as a sampling-based NBV planner, while taking ten times less +computation and generating 28% more efficient trajectories. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Towards Full-scene Domain Generalization in Multi-agent Collaborative + Bird's Eye View Segmentation for Connected and Autonomous Driving + + +
+ Collaborative perception has recently gained significant attention in +autonomous driving, improving perception quality by enabling the exchange of +additional information among vehicles. However, deploying collaborative +perception systems can lead to domain shifts due to diverse environmental +conditions and data heterogeneity among connected and autonomous vehicles +(CAVs). To address these challenges, we propose a unified domain generalization +framework applicable in both training and inference stages of collaborative +perception. In the training phase, we introduce an Amplitude Augmentation +(AmpAug) method to augment low-frequency image variations, broadening the +model's ability to learn across various domains. We also employ a +meta-consistency training scheme to simulate domain shifts, optimizing the +model with a carefully designed consistency loss to encourage domain-invariant +representations. In the inference phase, we introduce an intra-system domain +alignment mechanism to reduce or potentially eliminate the domain discrepancy +among CAVs prior to inference. Comprehensive experiments substantiate the +effectiveness of our method in comparison with the existing state-of-the-art +works. Code will be released at https://github.com/DG-CAVs/DG-CoPerception.git. + +
+
+
+
+
+ + ☆ As-Plausible-As-Possible: Plausibility-Aware Mesh Deformation Using 2D + Diffusion Priors + + +
+ We present the As-Plausible-as-Possible (APAP) mesh deformation technique that +leverages 2D diffusion priors to preserve the plausibility of a mesh under +user-controlled deformation. Our framework uses per-face Jacobians to represent +mesh deformations, where mesh vertex coordinates are computed via a +differentiable Poisson solve. The deformed mesh is rendered, and the resulting +2D image is used in the Score Distillation Sampling (SDS) process, which +enables extracting meaningful plausibility priors from a pretrained 2D +diffusion model. To better preserve the identity of the edited mesh, we +fine-tune our 2D diffusion model with LoRA. Gradients extracted by SDS and a +user-prescribed handle displacement are then backpropagated to the per-face +Jacobians, and we use iterative gradient descent to compute the final +deformation that balances the user edit and the output plausibility. We +evaluate our method with 2D and 3D meshes and demonstrate qualitative and +quantitative improvements when using plausibility priors over the +geometry-preservation or distortion-minimization priors used by previous +techniques. + +
+
+ comment: Project page: https://as-plausible-as-possible.github.io/ +
+
+
+
+
+ + ☆ Riemannian Self-Attention Mechanism for SPD Networks + + +
+ Symmetric positive definite (SPD) matrix has been demonstrated to be an +effective feature descriptor in many scientific areas, as it can encode +spatiotemporal statistics of the data adequately on a curved Riemannian +manifold, i.e., SPD manifold. Although there are many different ways to design +network architectures for SPD matrix nonlinear learning, very few solutions +explicitly mine the geometrical dependencies of features at different layers. +Motivated by the great success of self-attention mechanism in capturing +long-range relationships, an SPD manifold self-attention mechanism (SMSA) is +proposed in this paper using some manifold-valued geometric operations, mainly +the Riemannian metric, Riemannian mean, and Riemannian optimization. Then, an +SMSA-based geometric learning module (SMSA-GLM) is designed for the sake of +improving the discrimination of the generated deep structured representations. +Extensive experimental results achieved on three benchmarking datasets show +that our modification against the baseline network further alleviates the +information degradation problem and leads to improved accuracy. + +
+
+ comment: 14 pages, 10 figures, 5 tables +
+
+
+
+
+ + ☆ Point'n Move: Interactive Scene Object Manipulation on Gaussian + Splatting Radiance Fields + + +
+ We propose Point'n Move, a method that achieves interactive scene object +manipulation with exposed region inpainting. Interactivity here further comes +from intuitive object selection and real-time editing. To achieve this, we +adopt Gaussian Splatting Radiance Field as the scene representation and fully +leverage its explicit nature and speed advantage. Its explicit representation +formulation allows us to devise a dual-stage self-prompting segmentation algorithm +that turns 2D prompt points into 3D masks, perform mask refinement and merging, +minimize changes, provide good initialization for scene inpainting, and +perform editing in real time without per-edit training, all of which leads to +superior quality and performance. We test our method by performing editing on +both forward-facing and 360 scenes. We also compare our method against existing +scene object removal methods, showing superior quality despite being more +capable and having a speed advantage. + +
+
+
+
+
+ + ☆ Photo-SLAM: Real-time Simultaneous Localization and Photorealistic + Mapping for Monocular, Stereo, and RGB-D Cameras + + +
+ The integration of neural rendering and the SLAM system recently showed +promising results in joint localization and photorealistic view reconstruction. +However, existing methods, fully relying on implicit representations, are so +resource-hungry that they cannot run on portable devices, which deviates from +the original intention of SLAM. In this paper, we present Photo-SLAM, a novel +SLAM framework with a hyper primitives map. Specifically, we simultaneously +exploit explicit geometric features for localization and learn implicit +photometric features to represent the texture information of the observed +environment. In addition to actively densifying hyper primitives based on +geometric features, we further introduce a Gaussian-Pyramid-based training +method to progressively learn multi-level features, enhancing photorealistic +mapping performance. The extensive experiments with monocular, stereo, and +RGB-D datasets prove that our proposed system Photo-SLAM significantly +outperforms current state-of-the-art SLAM systems for online photorealistic +mapping, e.g., PSNR is 30% higher and rendering speed is hundreds of times +faster in the Replica dataset. Moreover, the Photo-SLAM can run at real-time +speed using an embedded platform such as Jetson AGX Orin, showing the potential +of robotics applications. + +
+
+
+
+
+ + ☆ Embodied Multi-Modal Agent trained by an LLM from a Parallel TextWorld + + +
+ While large language models (LLMs) excel in a simulated world of texts, they +struggle to interact with the more realistic world without perceptions of other +modalities such as visual or audio signals. Although vision-language models +(VLMs) integrate LLM modules (1) aligned with static image features, and (2) +may possess prior knowledge of world dynamics (as demonstrated in the text +world), they have not been trained in an embodied visual world and thus cannot +align with its dynamics. On the other hand, training an embodied agent in a +noisy visual world without expert guidance is often challenging and +inefficient. In this paper, we train a VLM agent living in a visual world using +an LLM agent excelling in a parallel text world (but inapplicable to the visual +world). Specifically, we distill LLM's reflection outcomes (improved actions by +analyzing mistakes) in a text world's tasks to finetune the VLM on the same +tasks of the visual world, resulting in an Embodied Multi-Modal Agent (EMMA) +quickly adapting to the visual world dynamics. Such cross-modality imitation +learning between the two parallel worlds enables EMMA to generalize to a broad +scope of new tasks without any further guidance from the LLM expert. Extensive +evaluations on the ALFWorld benchmark highlight EMMA's superior performance to +SOTA VLM-based agents across diverse tasks, e.g., 20%-70% improvement in the +success rate. + +
+
+
+
+
+ + ☆ LEDITS++: Limitless Image Editing using Text-to-Image Models + + +
+ Text-to-image diffusion models have recently received increasing interest for +their astonishing ability to produce high-fidelity images from solely text +inputs. Subsequent research efforts aim to exploit and apply their capabilities +to real image editing. However, existing image-to-image methods are often +inefficient, imprecise, and of limited versatility. They either require +time-consuming fine-tuning, deviate unnecessarily strongly from the input +image, and/or lack support for multiple, simultaneous edits. To address these +issues, we introduce LEDITS++, an efficient yet versatile and precise textual +image manipulation technique. LEDITS++'s novel inversion approach requires no +tuning nor optimization and produces high-fidelity results with a few diffusion +steps. Second, our methodology supports multiple simultaneous edits and is +architecture-agnostic. Third, we use a novel implicit masking technique that +limits changes to relevant image regions. We propose the novel TEdBench++ +benchmark as part of our exhaustive evaluation. Our results demonstrate the +capabilities of LEDITS++ and its improvements over previous methods. The +project page is available at https://leditsplusplus-project.static.hf.space . + +
+
+
+
+
+ + ☆ Full-resolution MLPs Empower Medical Dense Prediction + + +
+ Dense prediction is a fundamental requirement for many medical vision tasks +such as medical image restoration, registration, and segmentation. The most +popular vision model, Convolutional Neural Networks (CNNs), has reached +bottlenecks due to the intrinsic locality of convolution operations. Recently, +transformers have been widely adopted for dense prediction for their capability +to capture long-range visual dependence. However, due to the high computational +complexity and large memory consumption of self-attention operations, +transformers are usually used at downsampled feature resolutions. Such usage +cannot effectively leverage the tissue-level textural information available +only at the full image resolution. This textural information is crucial for +medical dense prediction as it can differentiate the subtle human anatomy in +medical images. In this study, we hypothesize that Multi-layer Perceptrons +(MLPs) are superior alternatives to transformers in medical dense prediction +where tissue-level details dominate the performance, as MLPs enable long-range +dependence at the full image resolution. To validate our hypothesis, we develop +a full-resolution hierarchical MLP framework that uses MLPs beginning from the +full image resolution. We evaluate this framework with various MLP blocks on a +wide range of medical dense prediction tasks including restoration, +registration, and segmentation. Extensive experiments on six public +well-benchmarked datasets show that, by simply using MLPs at full resolution, +our framework outperforms its CNN and transformer counterparts and achieves +state-of-the-art performance on various medical dense prediction tasks. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ CADTalk: An Algorithm and Benchmark for Semantic Commenting of CAD + Programs + + +
+ CAD programs are a popular way to compactly encode shapes as a sequence of +operations that are easy to parametrically modify. However, without sufficient +semantic comments and structure, such programs can be challenging to +understand, let alone modify. We introduce the problem of semantic commenting +CAD programs, wherein the goal is to segment the input program into code blocks +corresponding to semantically meaningful shape parts and assign a semantic +label to each block. We solve the problem by combining program parsing with +visual-semantic analysis afforded by recent advances in foundational language +and vision models. Specifically, by executing the input programs, we create +shapes, which we use to generate conditional photorealistic images to make use +of semantic annotators for such images. We then distill the information across +the images and link back to the original programs to semantically comment on +them. Additionally, we collected and annotated a benchmark dataset, CADTalk, +consisting of 5,280 machine-made programs and 45 human-made programs with +ground truth semantic comments to foster future research. We extensively +evaluated our approach, compared to a GPT-based baseline approach, and an +open-set shape segmentation baseline, i.e., PartSLIP, and reported an 83.24% +accuracy on the new CADTalk dataset. Project page: +https://enigma-li.github.io/CADTalk/. + +
+
+
+
+
+ + ☆ Rethinking Intermediate Layers design in Knowledge Distillation for + Kidney and Liver Tumor Segmentation + + +
+ Knowledge distillation(KD) has demonstrated remarkable success across various +domains, but its application to medical imaging tasks, such as kidney and liver +tumor segmentation, has encountered challenges. Many existing KD methods are +not specifically tailored for these tasks. Moreover, prevalent KD methods often +lack a careful consideration of what and from where to distill knowledge from +the teacher to the student. This oversight may lead to issues like the +accumulation of training bias within shallower student layers, potentially +compromising the effectiveness of KD. To address these challenges, we propose +Hierarchical Layer-selective Feedback Distillation (HLFD). HLFD strategically +distills knowledge from a combination of middle layers to earlier layers and +transfers final layer knowledge to intermediate layers at both the feature and +pixel levels. This design allows the model to learn higher-quality +representations from earlier layers, resulting in a robust and compact student +model. Extensive quantitative evaluations reveal that HLFD outperforms existing +methods by a significant margin. For example, in the kidney segmentation task, +HLFD surpasses the student model (without KD) by over 10pp, significantly +improving its focus on tumor-specific features. From a qualitative standpoint, +the student model trained using HLFD excels at suppressing irrelevant +information and can focus sharply on tumor-specific details, which opens a new +pathway for more efficient and accurate diagnostic tools. + +
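At its core, layer-selective feedback distillation matches selected student features to selected (typically deeper) teacher features. The sketch below shows a generic feature-level version of that idea with a plain MSE objective; the layer pairing and the loss form are simplifying assumptions, not the exact HLFD formulation.

```python
import torch.nn.functional as F


def layer_selective_distillation_loss(student_feats, teacher_feats, pairs):
    """Distill chosen teacher layers into chosen (earlier) student layers.

    `pairs` is a list of (student_idx, teacher_idx); feature maps are assumed to
    share channel counts, and spatial sizes are matched by interpolation.
    """
    loss = 0.0
    for s_idx, t_idx in pairs:
        s = student_feats[s_idx]
        t = teacher_feats[t_idx].detach()  # no gradient into the teacher
        if s.shape[-2:] != t.shape[-2:]:
            t = F.interpolate(t, size=s.shape[-2:], mode="bilinear", align_corners=False)
        loss = loss + F.mse_loss(s, t)
    return loss
```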
+
+ comment: Under-review at ISBI-2024 +
+
+
+
+
+ + ☆ ContextSeg: Sketch Semantic Segmentation by Querying the Context with + Attention + + +
+ Sketch semantic segmentation is a well-explored and pivotal problem in +computer vision involving the assignment of pre-defined part labels to +individual strokes. This paper presents ContextSeg - a simple yet highly +effective approach to tackling this problem with two stages. In the first +stage, to better encode the shape and positional information of strokes, we +propose to predict an extra dense distance field in an autoencoder network to +reinforce structural information learning. In the second stage, we treat an +entire stroke as a single entity and label a group of strokes within the same +semantic part using an auto-regressive Transformer with the default attention +mechanism. By group-based labeling, our method can fully leverage the context +information when making decisions for the remaining groups of strokes. Our +method achieves the best segmentation accuracy compared with state-of-the-art +approaches on two representative datasets and has been extensively evaluated +demonstrating its superior performance. Additionally, we offer insights into +solving part imbalance in training data and the preliminary experiment on +cross-category training, which can inspire future research in this field. + +
+
+
+
+
+ + ☆ Understanding the (Extra-)Ordinary: Validating Deep Model Decisions with + Prototypical Concept-based Explanations + + +
+ Ensuring both transparency and safety is critical when deploying Deep Neural +Networks (DNNs) in high-risk applications, such as medicine. The field of +explainable AI (XAI) has proposed various methods to comprehend the +decision-making processes of opaque DNNs. However, only a few XAI methods are +suitable for ensuring safety in practice, as they heavily rely on repeated, +labor-intensive, and possibly biased human assessment. In this work, we present +a novel post-hoc concept-based XAI framework that conveys not only +instance-wise (local) but also class-wise (global) decision-making strategies +via prototypes. What sets our approach apart is the combination of local and +global strategies, enabling a clearer understanding of the (dis-)similarities +in model decisions compared to the expected (prototypical) concept use, +ultimately reducing the dependence on long-term human assessment. Quantifying +the deviation from prototypical behavior allows us not only to associate +predictions with specific model sub-strategies but also to detect outlier +behavior. As such, our approach constitutes an intuitive and explainable tool +for model validation. We demonstrate the effectiveness of our approach in +identifying out-of-distribution samples, spurious model behavior, and data +quality issues across three datasets (ImageNet, CUB-200, and CIFAR-10) +utilizing VGG, ResNet, and EfficientNet architectures. Code is available at +https://github.com/maxdreyer/pcx. + +
+
+ comment: 37 pages (9 pages manuscript, 2 pages references, 26 pages appendix) +
+
+
+
+
+ + ☆ Large Language Models Meet Computer Vision: A Brief Survey + + +
+ Recently, the intersection of Large Language Models (LLMs) and Computer +Vision (CV) has emerged as a pivotal area of research, driving significant +advancements in the field of Artificial Intelligence (AI). As transformers have +become the backbone of many state-of-the-art models in both Natural Language +Processing (NLP) and CV, understanding their evolution and potential +enhancements is crucial. This survey paper delves into the latest advances in +the domain of transformers and their successors, emphasizing their potential to +revolutionize Vision Transformers (ViTs) and LLMs. This survey also presents a +comparative analysis, juxtaposing the performance metrics of several leading +paid and open-source LLMs and shedding light on their strengths and areas for +improvement, along with a literature review on how LLMs are being used to +tackle vision-related tasks. Furthermore, the survey presents a comprehensive +collection of datasets employed to train LLMs, offering insights into the +diverse data available to achieve high performance in various pre-training and +downstream tasks of LLMs. The survey concludes by highlighting open directions +in the field, suggesting potential avenues for future research and development. +This survey aims to underscore the profound intersection of LLMs and CV, +leading to a new era of integrated and advanced AI models. + +
+
+
+
+
+ + ☆ SplitNeRF: Split Sum Approximation Neural Field for Joint Geometry, + Illumination, and Material Estimation + + +
+ We present a novel approach for digitizing real-world objects by estimating +their geometry, material properties, and environmental lighting from a set of +posed images with fixed lighting. Our method incorporates into Neural Radiance +Field (NeRF) pipelines the split sum approximation used with image-based +lighting for real-time physically based rendering. We propose modeling the +scene's lighting with a single scene-specific MLP representing pre-integrated +image-based lighting at arbitrary resolutions. We achieve accurate modeling of +pre-integrated lighting by exploiting a novel regularizer based on efficient +Monte Carlo sampling. Additionally, we propose a new method of supervising +self-occlusion predictions by exploiting a similar regularizer based on Monte +Carlo sampling. Experimental results demonstrate the efficiency and +effectiveness of our approach in estimating scene geometry, material +properties, and lighting. Our method is capable of attaining state-of-the-art +relighting quality after only ${\sim}1$ hour of training on a single NVIDIA +A100 GPU. + +
+
+
+
+
+ + ☆ LiveNVS: Neural View Synthesis on Live RGB-D Streams SIGGRAPH + + +
+ Existing real-time RGB-D reconstruction approaches, like Kinect Fusion, lack +real-time photo-realistic visualization. This is due to noisy, oversmoothed or +incomplete geometry and blurry textures which are fused from imperfect depth +maps and camera poses. Recent neural rendering methods can overcome many of +such artifacts but are mostly optimized for offline usage, hindering the +integration into a live reconstruction pipeline. + In this paper, we present LiveNVS, a system that allows for neural novel view +synthesis on a live RGB-D input stream with very low latency and real-time +rendering. Based on the RGB-D input stream, novel views are rendered by +projecting neural features into the target view via a densely fused depth map +and aggregating the features in image-space to a target feature map. A +generalizable neural network then translates the target feature map into a +high-quality RGB image. LiveNVS achieves state-of-the-art neural rendering +quality of unknown scenes during capturing, allowing users to virtually explore +the scene and assess reconstruction quality in real-time. + +
+
+ comment: main paper: 8 pages, total number of pages: 15, 13 figures, to be + published in SIGGRAPH Asia 2023 Conference Papers +
+
+
+
+
+ + ☆ DGNR: Density-Guided Neural Point Rendering of Large Driving Scenes + + +
+ Despite the recent success of Neural Radiance Field (NeRF), it is still +challenging to render large-scale driving scenes with long trajectories, +particularly when the rendering quality and efficiency are in high demand. +Existing methods for such scenes usually involve spatial warping, geometric +supervision from zero-shot normal or depth estimation, or scene-division +strategies, where the synthesized views are often blurry or fail to meet the +requirement of efficient rendering. To address the above challenges, this paper +presents a novel framework that learns a density space from the scenes to guide +the construction of a point-based renderer, dubbed DGNR (Density-Guided Neural +Rendering). In DGNR, geometric priors are no longer needed, as the density +space can be intrinsically learned through volumetric rendering. Specifically, +we make use of a differentiable renderer to synthesize images from the neural +density features obtained from the learned density space. A density-based +fusion module and geometric regularization are proposed to optimize the density +space. By conducting experiments on a widely used autonomous driving dataset, +we have validated the effectiveness of DGNR in synthesizing photorealistic +driving scenes and achieving real-time-capable rendering. + +
+
+
+
+
+ + ☆ SCALAR-NeRF: SCAlable LARge-scale Neural Radiance Fields for Scene + Reconstruction SC + + +
+ In this work, we introduce SCALAR-NeRF, a novel framework tailored for +scalable large-scale neural scene reconstruction. We structure the neural +representation as an encoder-decoder architecture, where the encoder processes +3D point coordinates to produce encoded features, and the decoder generates +geometric values that include volume densities of signed distances and colors. +Our approach first trains a coarse global model on the entire image dataset. +Subsequently, we partition the images into smaller blocks using KMeans with +each block being modeled by a dedicated local model. We enhance the overlapping +regions across different blocks by scaling up the bounding boxes of each local +block. Notably, the decoder from the global model is shared across distinct +blocks and therefore promoting alignment in the feature space of local +encoders. We propose an effective and efficient methodology to fuse the outputs +from these local models to attain the final reconstruction. Employing this +refined coarse-to-fine strategy, our method outperforms state-of-the-art NeRF +methods and demonstrates scalability for large-scale scene reconstruction. The +code will be available on our project page at +https://aibluefisher.github.io/SCALAR-NeRF/ + +
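As a rough illustration of the block-partitioning step described above, the snippet below clusters camera positions with KMeans and enlarges each block's bounding box so that neighbouring blocks overlap. Clustering on camera centres, the overlap ratio, and the function name are assumptions for illustration, not the paper's exact procedure.

```python
import numpy as np
from sklearn.cluster import KMeans

def partition_cameras(cam_centres, n_blocks=4, overlap=0.15):
    """cam_centres: (N, 3) camera positions.

    Returns per-camera block ids and a slightly enlarged axis-aligned
    bounding box (lo, hi) for each block so adjacent blocks overlap.
    """
    labels = KMeans(n_clusters=n_blocks, n_init=10, random_state=0).fit_predict(cam_centres)
    boxes = []
    for b in range(n_blocks):
        pts = cam_centres[labels == b]
        lo, hi = pts.min(axis=0), pts.max(axis=0)
        pad = overlap * (hi - lo)  # scale up the box to create overlapping regions
        boxes.append((lo - pad, hi + pad))
    return labels, boxes
```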
+
+ comment: Project Page: https://aibluefisher.github.io/SCALAR-NeRF +
+
+
+
+
+ + ☆ Augmenting x-ray single particle imaging reconstruction with + self-supervised machine learning + + +
+ The development of X-ray Free Electron Lasers (XFELs) has opened numerous +opportunities to probe atomic structure and ultrafast dynamics of various +materials. Single Particle Imaging (SPI) with XFELs enables the investigation +of biological particles in their natural physiological states with unparalleled +temporal resolution, while circumventing the need for cryogenic conditions or +crystallization. However, reconstructing real-space structures from +reciprocal-space x-ray diffraction data is highly challenging due to the +absence of phase and orientation information, which is further complicated by +weak scattering signals and considerable fluctuations in the number of photons +per pulse. In this work, we present an end-to-end, self-supervised machine +learning approach to recover particle orientations and estimate reciprocal +space intensities from diffraction images only. Our method demonstrates great +robustness under demanding experimental conditions with significantly enhanced +reconstruction capabilities compared with conventional algorithms, and +signifies a paradigm shift in SPI as currently practiced at XFELs. + +
+
+
+
+
+ + ☆ Parallax-Tolerant Image Stitching with Epipolar Displacement Field + + +
+ Large parallax image stitching is a challenging task. Existing methods often +struggle to maintain both the local and global structures of the image while +reducing alignment artifacts and warping distortions. In this paper, we propose +a novel approach that utilizes epipolar geometry to establish a warping +technique based on the epipolar displacement field. Initially, the warping rule +for pixels in the epipolar geometry is established through the infinite +homography. Subsequently, the epipolar displacement field, which represents the +sliding distance of the warped pixel along the epipolar line, is formulated by +thin-plate splines based on the principle of local elastic deformation. The +stitching result can be generated by inversely warping the pixels according to +the epipolar displacement field. This method incorporates the epipolar +constraints in the warping rule, which ensures high-quality alignment and +maintains the projectivity of the panorama. Qualitative and quantitative +comparative experiments demonstrate the competitiveness of the proposed method +in stitching images with large parallax. + +
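The epipolar constraint underlying this warp can be summarised in a few lines: given a fundamental matrix F, each pixel of one image is restricted to slide along its epipolar line in the other image. The sketch below shows only that constraint; the thin-plate-spline displacement field and the inverse warping of the paper are not reproduced, and the helper names are assumptions.

```python
import numpy as np

def epipolar_line(F, pt):
    """Line l = F @ [x, y, 1]^T; points x' on the line satisfy x'^T l = 0."""
    p = np.array([pt[0], pt[1], 1.0])
    a, b, c = F @ p
    norm = np.hypot(a, b)
    return np.array([a, b, c]) / max(norm, 1e-12)  # normalised line coefficients

def slide_along_line(line, anchor, distance):
    """Move a point lying on/near the epipolar line by `distance` along the line."""
    a, b, _ = line
    direction = np.array([-b, a])  # unit direction vector of the line ax + by + c = 0
    return np.asarray(anchor, dtype=float) + distance * direction
```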
+
+
+
+
+ + ☆ MotionZero:Exploiting Motion Priors for Zero-shot Text-to-Video + Generation + + +
+ Zero-shot Text-to-Video synthesis generates videos based on prompts without +any video data. Without motion information from videos, motion priors implied +in prompts are vital guidance. For example, the prompt "airplane landing on the +runway" indicates motion priors that the "airplane" moves downwards while the +"runway" stays static. However, these motion priors are not fully exploited in +previous approaches, leading to two nontrivial issues: 1) the motion variation +pattern remains unaltered and prompt-agnostic because motion priors are +disregarded; 2) the motion control of different objects is inaccurate and +entangled because the independent motion priors of different objects are not +considered. To tackle these two issues, we propose a prompt-adaptive and +disentangled motion control strategy coined MotionZero, which derives motion +priors for different objects from prompts via Large Language Models and +accordingly applies motion control to the corresponding regions of different +objects in a disentangled manner. Furthermore, to facilitate videos with +varying degrees of motion amplitude, we propose a Motion-Aware Attention scheme +which adjusts attention among frames according to motion amplitude. Extensive +experiments demonstrate that our strategy can correctly control the motion of +different objects and supports versatile applications including zero-shot video +editing. + +
+
+
+
+
+ + ☆ Visual Semantic Navigation with Real Robots + + +
+ Visual Semantic Navigation (VSN) is the ability of a robot to learn visual +semantic information for navigating in unseen environments. These VSN models +are typically tested in those virtual environments where they are trained, +mainly using reinforcement learning based approaches. Therefore, we do not yet +have an in-depth analysis of how these models would behave in the real world. +In this work, we propose a new solution to integrate VSN models into real +robots, so that we have true embodied agents. We also release a novel ROS-based +framework for VSN, ROS4VSN, so that any VSN-model can be easily deployed in any +ROS-compatible robot and tested in a real setting. Our experiments with two +different robots, where we have embedded two state-of-the-art VSN agents, +confirm that there is a noticeable performance difference of these VSN +solutions when tested in real-world and simulation environments. We hope that +this research will endeavor to provide a foundation for addressing this +consequential issue, with the ultimate aim of advancing the performance and +efficiency of embodied agents within authentic real-world scenarios. Code to +reproduce all our experiments can be found at +https://github.com/gramuah/ros4vsn. + +
+
+
+
+
+ + ☆ Cross-level Attention with Overlapped Windows for Camouflaged Object + Detection + + +
+ Camouflaged objects adaptively fit their color and texture with the +environment, which makes them indistinguishable from the surroundings. Current +methods have revealed that high-level semantic features can highlight the +differences between camouflaged objects and the backgrounds. Consequently, they +integrate high-level semantic features with low-level detailed features for +accurate camouflaged object detection (COD). Unlike previous designs for +multi-level feature fusion, we argue that enhancing low-level features is more +pressing for COD. In this paper, we propose overlapped window cross-level +attention (OWinCA) to achieve low-level feature enhancement guided by the +highest-level features. By sliding an aligned window pair on both the highest- +and low-level feature maps, the high-level semantics are explicitly integrated +into the low-level details via cross-level attention. Additionally, it employs +an overlapped window partition strategy to alleviate the incoherence among +windows, which prevents the loss of global information. These designs enable +the proposed OWinCA to enhance low-level features by promoting the separability +of camouflaged objects. The resulting OWinCANet fuses these enhanced +multi-level features with a simple convolution operation to achieve the final +COD. Experiments conducted on three large-scale COD datasets demonstrate that +our OWinCANet significantly surpasses the current state-of-the-art COD methods. + +
+
+
+
+
+ + ☆ Filter-Pruning of Lightweight Face Detectors Using a Geometric Median + Criterion WACV 2024 + + +
+ Face detectors are becoming a crucial component of many applications, +including surveillance, that often have to run on edge devices with limited +processing power and memory. Therefore, there's a pressing demand for compact +face detection models that can function efficiently across resource-constrained +devices. Over recent years, network pruning techniques have attracted a lot of +attention from researchers. These methods haven't been well examined in the +context of face detectors, despite their expanding popularity. In this paper, +we implement filter pruning on two already small and compact face detectors, +named EXTD (Extremely Tiny Face Detector) and EResFD (Efficient ResNet Face +Detector). The main pruning algorithm that we utilize is Filter Pruning via +Geometric Median (FPGM), combined with the Soft Filter Pruning (SFP) iterative +procedure. We also apply L1 Norm pruning, as a baseline to compare with the +proposed approach. The experimental evaluation on the WIDER FACE dataset +indicates that the proposed approach has the potential to further reduce the +model size of already lightweight face detectors, with limited accuracy loss, +or even with small accuracy gain for low pruning rates. + +
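The core criterion mentioned above, Filter Pruning via Geometric Median (FPGM), can be stated compactly: filters whose total distance to all other filters of the layer is smallest lie closest to the geometric median and are treated as the most redundant. The sketch below illustrates only that selection rule under assumed names; the soft-pruning schedule and detector-specific details are not reproduced.

```python
import torch

def fpgm_prune_indices(conv_weight, prune_ratio=0.3):
    """conv_weight: (out_channels, in_channels, kH, kW) weight tensor of one conv layer.

    Returns the indices of filters to prune, i.e. those nearest the geometric
    median of the layer's filters.
    """
    n = conv_weight.shape[0]
    flat = conv_weight.reshape(n, -1)
    # pairwise Euclidean distances between all filters
    dists = torch.cdist(flat, flat)
    # a filter's summed distance to all others is smallest when it lies
    # closest to the geometric median of the set -> most replaceable
    scores = dists.sum(dim=1)
    n_prune = int(n * prune_ratio)
    return torch.argsort(scores)[:n_prune]
```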
+
+ comment: Accepted for publication in the IEEE/CVF WACV 2024 Workshops + proceedings, Hawaii, USA, Jan. 2024 +
+
+
+
+
+ + ☆ Empowering COVID-19 Detection: Optimizing Performance Through Fine-Tuned + EfficientNet Deep Learning Architecture + + +
+ The worldwide COVID-19 pandemic has profoundly influenced the health and +everyday experiences of individuals across the planet. It is a highly +contagious respiratory disease requiring early and accurate detection to curb +its rapid transmission. Initial testing methods primarily revolved around +identifying the genetic composition of the coronavirus, exhibiting a relatively +low detection rate and requiring a time-intensive procedure. To address this +challenge, experts have suggested using radiological imagery, particularly +chest X-rays, as a valuable approach within the diagnostic protocol. This study +investigates the potential of leveraging radiographic imaging (X-rays) with +deep learning algorithms to swiftly and precisely identify COVID-19 patients. +The proposed approach improves detection accuracy by fine-tuning appropriate +layers of various established transfer learning models. The experimentation was +conducted on a COVID-19 X-ray dataset containing 2,000 images. The fine-tuned +EfficientNetB4 model achieved an impressive accuracy of 100%, showcasing its +potential as a robust COVID-19 detection model. Furthermore, EfficientNetB4 +excelled at identifying lung disease using a chest X-ray dataset containing +4,350 images, achieving remarkable performance with an accuracy of 99.17%, +precision of 99.13%, recall of 99.16%, and F1-score of 99.14%. These results +highlight the promise of fine-tuned transfer learning for efficient lung +disease detection through medical imaging, especially with X-ray images. This +research offers radiologists an effective means of aiding rapid and precise +COVID-19 diagnosis and contributes valuable assistance for healthcare +professionals in accurately identifying affected patients. + +
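A minimal fine-tuning sketch in the spirit of the approach above, using torchvision's EfficientNet-B4 with a replaced classification head. The two-class setup, the frozen backbone, and all hyper-parameters are assumptions for illustration, not the paper's exact configuration.

```python
import torch
import torch.nn as nn
from torchvision import models

# load ImageNet-pretrained EfficientNet-B4
model = models.efficientnet_b4(weights=models.EfficientNet_B4_Weights.IMAGENET1K_V1)

# replace the 1000-way ImageNet head with a 2-way head (e.g. COVID vs. normal)
in_features = model.classifier[1].in_features
model.classifier[1] = nn.Linear(in_features, 2)

# optionally freeze the convolutional backbone and train only the new head first
for p in model.features.parameters():
    p.requires_grad = False

optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)
criterion = nn.CrossEntropyLoss()
```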
+
+ comment: Computers in Biology and Medicine [Q1, IF: 7.7, CS: 9.2] +
+
+
+
+
+ + ☆ Improving Lane Detection Generalization: A Novel Framework using HD Maps + for Boosting Diversity + + +
+ Lane detection is a vital task for vehicles to navigate and localize their +position on the road. To ensure reliable results, lane detection algorithms +must have robust generalization performance in various road environments. +However, despite the significant performance improvement of deep learning-based +lane detection algorithms, their generalization performance in response to +changes in road environments still falls short of expectations. In this paper, +we present a novel framework for single-source domain generalization (SSDG) in +lane detection. By decomposing data into lane structures and surroundings, we +enhance diversity using High-Definition (HD) maps and generative models. Rather +than expanding data volume, we strategically select a core subset of data, +maximizing diversity and optimizing performance. Our extensive experiments +demonstrate that our framework enhances the generalization performance of lane +detection, comparable to the domain adaptation-based method. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ☆ GeoScaler: Geometry and Rendering-Aware Downsampling of 3D Mesh Textures + + +
+ High-resolution texture maps are necessary for representing real-world +objects accurately with 3D meshes. The large sizes of textures can bottleneck +the real-time rendering of high-quality virtual 3D scenes on devices having low +computational budgets and limited memory. Downsampling the texture maps +directly addresses the issue, albeit at the cost of visual fidelity. +Traditionally, downsampling of texture maps is performed using methods like +bicubic interpolation and the Lanczos algorithm. These methods ignore the +geometric layout of the mesh and its UV parametrization and also do not account +for the rendering process used to obtain the final visualization that the users +will experience. Towards filling these gaps, we introduce GeoScaler, which is a +method of downsampling texture maps of 3D meshes while incorporating geometric +cues, and by maximizing the visual fidelity of the rendered views of the +textured meshes. We show that the textures generated by GeoScaler deliver +significantly better quality rendered images compared to those generated by +traditional downsampling methods + +
+
+
+
+
+ + ☆ Clean Label Disentangling for Medical Image Segmentation with Noisy + Labels + + +
+ Current medical image segmentation methods suffer from incorrect annotations, +known as the noisy label issue. Most methods for medical image segmentation +with noisy labels utilize either a noise transition matrix, noise-robust loss +functions, or pseudo-labeling, while none of the current research focuses on +clean label disentanglement. We argue that the main reason is that severe class +imbalance leads to inaccurate selection of ``clean'' labels, thus weakening the +robustness of the model against label noise. In this work, we propose a simple +but efficient class-balanced sampling strategy to tackle the class imbalance +problem, which enables our newly proposed clean label disentangling framework +to successfully select clean labels from the given label sets and encourages +the model to learn from the correct annotations. However, such a method filters +out many annotations that may still contain useful information. Therefore, we +further extend our clean label disentangling framework to a new noisy +feature-aided clean label disentangling framework, which makes full use of all +annotations to learn richer semantics. Extensive experiments validate the +effectiveness of our methods, which achieve new state-of-the-art performance. +Our code is available at https://github.com/xiaoyao3302/2BDenoise. + +
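The snippet below gives an assumed, simplified picture of a class-balanced sampling step like the one described above: drawing at most a fixed number of candidate pixels per class before any clean-label selection is applied. The function name, per-class budget, and tensor layout are illustrative choices, not the authors' implementation.

```python
import torch

def class_balanced_sample(labels, per_class):
    """labels: (H*W,) integer label tensor for one image.

    Returns indices containing up to `per_class` randomly chosen pixels for
    every class present, so rare classes are not swamped by frequent ones.
    """
    picked = []
    for c in labels.unique():
        idx = (labels == c).nonzero(as_tuple=True)[0]
        perm = idx[torch.randperm(idx.numel(), device=idx.device)][:per_class]
        picked.append(perm)
    return torch.cat(picked)
```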
+
+ comment: 13 pages, 6 figures, 11 tables +
+
+
+
+
+ + ☆ Efficient Key-Based Adversarial Defense for ImageNet by Using + Pre-trained Model + + +
+ In this paper, we propose key-based defense model proliferation by leveraging +pre-trained models and utilizing recent efficient fine-tuning techniques on +ImageNet-1k classification. First, we stress that deploying key-based models on +edge devices is feasible with the latest model deployment advancements, such as +Apple CoreML, although the mainstream enterprise edge artificial intelligence +(Edge AI) has been focused on the Cloud. Then, we point out that the previous +key-based defense on on-device image classification is impractical for two +reasons: (1) training many classifiers from scratch is not feasible, and (2) +key-based defenses still need to be thoroughly tested on large datasets like +ImageNet. To this end, we propose to leverage pre-trained models and utilize +efficient fine-tuning techniques to proliferate key-based models even on +limited computing resources. Experiments were carried out on the ImageNet-1k +dataset using adaptive and non-adaptive attacks. The results show that our +proposed fine-tuned key-based models achieve a superior classification accuracy +(more than 10% increase) compared to the previous key-based models on +classifying clean and adversarial examples. + +
+
+
+
+
+ + ☆ MobileDiffusion: Subsecond Text-to-Image Generation on Mobile Devices + + +
+ The deployment of large-scale text-to-image diffusion models on mobile +devices is impeded by their substantial model size and slow inference speed. In +this paper, we propose \textbf{MobileDiffusion}, a highly efficient +text-to-image diffusion model obtained through extensive optimizations in both +architecture and sampling techniques. We conduct a comprehensive examination of +model architecture design to reduce redundancy, enhance computational +efficiency, and minimize model's parameter count, while preserving image +generation quality. Additionally, we employ distillation and diffusion-GAN +finetuning techniques on MobileDiffusion to achieve 8-step and 1-step inference +respectively. Empirical studies, conducted both quantitatively and +qualitatively, demonstrate the effectiveness of our proposed techniques. +MobileDiffusion achieves a remarkable \textbf{sub-second} inference speed for +generating a $512\times512$ image on mobile devices, establishing a new state +of the art. + +
+
+
+
+
+ + ☆ Egocentric Whole-Body Motion Capture with FisheyeViT and Diffusion-Based + Motion Refinement + + +
+ In this work, we explore egocentric whole-body motion capture using a single +fisheye camera, which simultaneously estimates human body and hand motion. This +task presents significant challenges due to three factors: the lack of +high-quality datasets, fisheye camera distortion, and human body +self-occlusion. To address these challenges, we propose a novel approach that +leverages FisheyeViT to extract fisheye image features, which are subsequently +converted into pixel-aligned 3D heatmap representations for 3D human body pose +prediction. For hand tracking, we incorporate dedicated hand detection and hand +pose estimation networks for regressing 3D hand poses. Finally, we develop a +diffusion-based whole-body motion prior model to refine the estimated +whole-body motion while accounting for joint uncertainties. To train these +networks, we collect a large synthetic dataset, EgoWholeBody, comprising +840,000 high-quality egocentric images captured across a diverse range of +whole-body motion sequences. Quantitative and qualitative evaluations +demonstrate the effectiveness of our method in producing high-quality +whole-body motion estimates from a single egocentric camera. + +
+
+
+
+
+ + ☆ DiffusionTalker: Personalization and Acceleration for Speech-Driven 3D + Face Diffuser + + +
+ Speech-driven 3D facial animation has been an attractive task in both +academia and industry. Traditional methods mostly focus on learning a +deterministic mapping from speech to animation. Recent approaches have started +to consider the non-deterministic nature of speech-driven 3D face animation and +employ the diffusion model for the task. However, personalizing facial +animation and accelerating animation generation are still two major limitations +of existing diffusion-based methods. To address the above limitations, we +propose DiffusionTalker, a diffusion-based method that utilizes contrastive +learning to personalize 3D facial animation and knowledge distillation to +accelerate 3D animation generation. Specifically, to enable personalization, we +introduce a learnable talking identity to aggregate knowledge in audio +sequences. The proposed identity embeddings extract customized facial cues +across different people in a contrastive learning manner. During inference, +users can obtain personalized facial animation based on input audio, reflecting +a specific talking style. We then distill the trained diffusion model, which +requires hundreds of steps, into a lightweight 8-step model for acceleration. +Extensive experiments are conducted to demonstrate that our method outperforms +state-of-the-art methods. The code will be released. + +
+
+
+
+
+ + ☆ Enhancing Scene Text Detectors with Realistic Text Image Synthesis Using + Diffusion Models + + +
+ Scene text detection techniques have garnered significant attention due to +their wide-ranging applications. However, existing methods have a high demand +for training data, and obtaining accurate human annotations is labor-intensive +and time-consuming. As a solution, researchers have widely adopted synthetic +text images as a complementary resource to real text images during +pre-training. Yet there is still room for synthetic datasets to enhance the +performance of scene text detectors. We contend that one main limitation of +existing generation methods is the insufficient integration of foreground text +with the background. To alleviate this problem, we present the Diffusion Model +based Text Generator (DiffText), a pipeline that utilizes the diffusion model +to seamlessly blend foreground text regions with the background's intrinsic +features. Additionally, we propose two strategies to generate visually coherent +text with fewer spelling errors. With fewer text instances, our produced text +images consistently surpass other synthetic data in aiding text detectors. +Extensive experiments on detecting horizontal, rotated, curved, and line-level +texts demonstrate the effectiveness of DiffText in producing realistic text +images. + +
+
+
+
+
+ + ☆ HandyPriors: Physically Consistent Perception of Hand-Object + Interactions with Differentiable Priors + + +
+ Various heuristic objectives for modeling hand-object interaction have been +proposed in past work. However, due to the lack of a cohesive framework, these +objectives often possess a narrow scope of applicability and are limited by +their efficiency or accuracy. In this paper, we propose HandyPriors, a unified +and general pipeline for pose estimation in human-object interaction scenes by +leveraging recent advances in differentiable physics and rendering. Our +approach employs rendering priors to align with input images and segmentation +masks along with physics priors to mitigate penetration and relative-sliding +across frames. Furthermore, we present two alternatives for hand and object +pose estimation. The optimization-based pose estimation achieves higher +accuracy, while the filtering-based tracking, which utilizes the differentiable +priors as dynamics and observation models, executes faster. We demonstrate that +HandyPriors attains comparable or superior results in the pose estimation task, +and that the differentiable physics module can predict contact information for +pose refinement. We also show that our approach generalizes to perception +tasks, including robotic hand manipulation and human-object pose estimation in +the wild. + +
+
+
+
+
+ + ☆ Multi-Irreducible Spectral Synchronization for Robust Rotation Averaging + + +
+ Rotation averaging (RA) is a fundamental problem in robotics and computer +vision. In RA, the goal is to estimate a set of $N$ unknown orientations +$R_{1}, ..., R_{N} \in SO(3)$, given noisy measurements $R_{ij} \sim R^{-1}_{i} +R_{j}$ of a subset of their pairwise relative rotations. This problem is both +nonconvex and NP-hard, and thus difficult to solve in the general case. We +apply harmonic analysis on compact groups to derive a (convex) spectral +relaxation constructed from truncated Fourier decompositions of the individual +summands appearing in the RA objective; we then recover an estimate of the RA +solution by computing a few extremal eigenpairs of this relaxation, and +(approximately) solving a consensus problem. Our approach affords several +notable advantages versus prior RA methods: it can be used in conjunction with +\emph{any} smooth loss function (including, but not limited to, robust +M-estimators), does not require any initialization, and is implemented using +only simple (and highly scalable) linear-algebraic computations and +parallelizable optimizations over band-limited functions of individual +rotational states. Moreover, under the (physically well-motivated) assumption +of multiplicative Langevin measurement noise, we derive explicit performance +guarantees for our spectral estimator (in the form of probabilistic tail bounds +on the estimation error) that are parameterized in terms of graph-theoretic +quantities of the underlying measurement network. By concretely linking +estimator performance with properties of the underlying measurement graph, our +results also indicate how to devise measurement networks that are +\emph{guaranteed} to achieve accurate estimation, enabling such downstream +tasks as sensor placement, network compression, and active sensing. + +
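For intuition about spectral approaches to rotation averaging, the sketch below implements a much simpler, classical spectral relaxation (stack relative rotations into a symmetric block matrix, take the three leading eigenvectors, and project each 3x3 block back onto SO(3)), not the multi-irreducible Fourier construction of the paper. The measurement convention R_ij ≈ R_i^T R_j and all names are assumptions, and estimates are recovered only up to a common global rotation.

```python
import numpy as np

def spectral_rotation_averaging(n, rel_rots):
    """rel_rots: dict mapping (i, j) -> measured 3x3 rotation R_ij ≈ R_i^T R_j."""
    M = np.zeros((3 * n, 3 * n))
    for (i, j), R in rel_rots.items():
        M[3 * i:3 * i + 3, 3 * j:3 * j + 3] = R
        M[3 * j:3 * j + 3, 3 * i:3 * i + 3] = R.T
    # the three leading eigenvectors approximately span the stacked rotations
    vals, vecs = np.linalg.eigh(M)
    V = vecs[:, -3:]                       # shape (3n, 3)
    rotations = []
    for i in range(n):
        B = V[3 * i:3 * i + 3, :]
        U, _, Vt = np.linalg.svd(B)        # project the block onto SO(3)
        R = U @ Vt
        if np.linalg.det(R) < 0:           # enforce det = +1
            R = U @ np.diag([1.0, 1.0, -1.0]) @ Vt
        rotations.append(R.T)              # estimate of R_i up to a global rotation
    return rotations
```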
+
+
+
+
+ + ☆ Exploring Straighter Trajectories of Flow Matching with Diffusion + Guidance + + +
+ Flow matching, as a generative modeling paradigm, has achieved notable success +across various domains. However, existing methods use either multi-round +training or knowledge within minibatches, posing challenges in finding a +favorable coupling strategy for straight trajectories. To address this issue, +we propose a novel approach, Straighter trajectories of Flow Matching +(StraightFM). It straightens trajectories with a coupling strategy guided by a +diffusion model at the level of the entire distribution. First, we propose a +coupling strategy to straighten trajectories, creating couplings between image +and noise samples under diffusion model guidance. Second, StraightFM also +integrates real data to enhance training, employing a neural network to +parameterize another coupling process from images to noise samples. StraightFM +is jointly optimized with couplings from the above two mutually complementary +directions, resulting in straighter trajectories and enabling both one-step and +few-step generation. Extensive experiments demonstrate that StraightFM yields +high-quality samples with fewer steps. StraightFM generates visually appealing +images with a lower FID among diffusion and traditional flow matching methods +within 5 sampling steps when trained in pixel space. In the latent space (i.e., +Latent Diffusion), StraightFM achieves a lower KID value compared to existing +methods on the CelebA-HQ 256 dataset in fewer than 10 sampling steps. + +
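For context, the sketch below shows a generic conditional flow matching objective under a given coupling of noise and data samples: along the straight path x_t = (1 - t) x0 + t x1 the regression target for the velocity network is simply x1 - x0. StraightFM's diffusion-guided coupling itself is not reproduced; the pairing of x0 and x1 is assumed to come from some coupling strategy, and the function names are illustrative.

```python
import torch
import torch.nn.functional as F

def flow_matching_loss(vel_net, x0, x1):
    """x0: noise samples, x1: data samples, paired by some coupling strategy."""
    b = x0.shape[0]
    # one random time per sample, broadcast over the remaining dimensions
    t = torch.rand(b, device=x0.device).view(b, *([1] * (x0.dim() - 1)))
    x_t = (1.0 - t) * x0 + t * x1      # point on the straight trajectory
    target_v = x1 - x0                 # constant velocity of the straight path
    pred_v = vel_net(x_t, t.view(b))   # network predicts the velocity field
    return F.mse_loss(pred_v, target_v)
```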
+
+
+
+
+ + ☆ Agents meet OKR: An Object and Key Results Driven Agent System with + Hierarchical Self-Collaboration and Self-Evaluation + + +
+ In this study, we introduce the concept of OKR-Agent, designed to enhance the +capabilities of Large Language Models (LLMs) in task-solving. Our approach +utilizes both self-collaboration and self-correction mechanisms, facilitated by +hierarchical agents, to address the inherent complexities in task-solving. Our +key observations are two-fold: first, effective task-solving demands in-depth +domain knowledge and intricate reasoning, for which deploying specialized +agents for individual sub-tasks can markedly enhance LLM performance. Second, +task-solving intrinsically adheres to a hierarchical execution structure, +comprising both high-level strategic planning and detailed task execution. +Towards this end, our OKR-Agent paradigm aligns closely with this hierarchical +structure, promising enhanced efficacy and adaptability across a range of +scenarios. Specifically, our framework includes two novel modules: hierarchical +Objects and Key Results generation, and multi-level evaluation, each +contributing to more efficient and robust task-solving. In practice, +hierarchical OKR generation decomposes Objects into multiple sub-Objects and +assigns new agents based on key results and agent responsibilities. These +agents subsequently elaborate on their designated tasks and may further +decompose them as necessary. Such generation operates recursively and +hierarchically, culminating in a comprehensive set of detailed solutions. The +multi-level evaluation module of OKR-Agent refines the solution by leveraging +feedback from all associated agents, optimizing each step of the process. This +ensures the solution is accurate and practical and effectively addresses +intricate task requirements, enhancing the overall reliability and quality of +the outcome. Experimental results also show our method outperforms previous +methods on several tasks. Code and demo are available at +https://okr-agent.github.io/ + +
+
+
+
+
+ + ☆ 3D Teeth Reconstruction from Panoramic Radiographs using Neural Implicit + Functions MICCAI 2023 + + +
+ Panoramic radiography is a widely used imaging modality in dental practice +and research. However, it only provides flattened 2D images, which limits the +detailed assessment of dental structures. In this paper, we propose Occudent, a +framework for 3D teeth reconstruction from panoramic radiographs using neural +implicit functions, which, to the best of our knowledge, is the first work to +do so. For a given point in 3D space, the implicit function estimates whether +the point is occupied by a tooth, and thus implicitly determines the boundaries +of 3D tooth shapes. Firstly, Occudent applies multi-label segmentation to the +input panoramic radiograph. Next, tooth shape embeddings as well as tooth class +embeddings are generated from the segmentation outputs, which are fed to the +reconstruction network. A novel module called Conditional eXcitation (CX) is +proposed in order to effectively incorporate the combined shape and class +embeddings into the implicit function. The performance of Occudent is evaluated +using both quantitative and qualitative measures. Importantly, Occudent is +trained and validated with actual panoramic radiographs as input, distinct from +recent works which used synthesized images. Experiments demonstrate the +superiority of Occudent over state-of-the-art methods. + +
+
+ comment: 12 pages, 2 figures, accepted to International Conference on Medical + Image Computing and Computer-Assisted Intervention MICCAI 2023 +
+
+
+
+
+ + ☆ Efficient Multimodal Diffusion Models Using Joint Data Infilling with + Partially Shared U-Net + + +
+ Recently, diffusion models have been used successfully to fit distributions +for cross-modal data translation and multimodal data generation. However, these +methods rely on extensive scaling, overlooking the inefficiency and +interference between modalities. We develop Partially Shared U-Net (PS-U-Net) +architecture which is an efficient multimodal diffusion model that allows text +and image inputs to pass through dedicated layers and skip-connections for +preserving modality-specific fine-grained details. Inspired by image +inpainting, we also propose a new efficient multimodal sampling method that +introduces new scenarios for conditional generation while only requiring a +simple joint distribution to be learned. Our empirical exploration of the +MS-COCO dataset demonstrates that our method generates multimodal text and +image data with higher quality compared to existing multimodal diffusion models +while having a comparable size, faster training, faster multimodal sampling, +and more flexible generation. + +
+
+
+
+
+ + ☆ A Unified Framework for Multimodal, Multi-Part Human Motion Synthesis + + +
+ The field has made significant progress in synthesizing realistic human +motion driven by various modalities. Yet, the need for different methods to +animate various body parts according to different control signals limits the +scalability of these techniques in practical scenarios. In this paper, we +introduce a cohesive and scalable approach that consolidates multimodal (text, +music, speech) and multi-part (hand, torso) human motion generation. Our +methodology unfolds in several steps: We begin by quantizing the motions of +diverse body parts into separate codebooks tailored to their respective +domains. Next, we harness the robust capabilities of pre-trained models to +transcode multimodal signals into a shared latent space. We then translate +these signals into discrete motion tokens by iteratively predicting subsequent +tokens to form a complete sequence. Finally, we reconstruct the continuous +actual motion from this tokenized sequence. Our method frames the multimodal +motion generation challenge as a token prediction task, drawing from +specialized codebooks based on the modality of the control signal. This +approach is inherently scalable, allowing for the easy integration of new +modalities. Extensive experiments demonstrated the effectiveness of our design, +emphasizing its potential for broad application. + +
+
+ comment: 19 pages, 18 figures +
+
+
+
+
+ + ☆ AvatarGPT: All-in-One Framework for Motion Understanding, Planning, + Generation and Beyond + + +
+ Large Language Models (LLMs) have shown remarkable emergent abilities in +unifying almost all (if not all) NLP tasks. In the human motion-related realm, +however, researchers still develop siloed models for each task. Inspired by +InstructGPT and the generalist concept behind Gato, we introduce AvatarGPT, an +All-in-One framework for motion understanding, planning, and generation, as +well as other tasks such as motion in-between synthesis. AvatarGPT treats each +task as one type of instruction, fine-tuned on the shared LLM. All the tasks +are seamlessly interconnected with language as the universal interface, +constituting a closed loop within the framework. To achieve this, human motion +sequences are first encoded as discrete tokens, which serve as the extended +vocabulary of the LLM. Then, an unsupervised pipeline to generate natural +language descriptions of human action sequences from in-the-wild videos is +developed. Finally, all tasks are jointly trained. Extensive experiments show +that AvatarGPT achieves SOTA on low-level tasks, and promising results on +high-level tasks, demonstrating the effectiveness of our proposed All-in-One +framework. Moreover, for the first time, AvatarGPT enables a principled +approach by iterative traversal of the tasks within the closed loop for +unlimited long-motion synthesis. + +
+
+ comment: 22 pages, 21 figures +
+
+
+
+
+ + ☆ TextDiffuser-2: Unleashing the Power of Language Models for Text + Rendering + + +
+ The diffusion model has proven to be a powerful generative model in recent +years, yet generating visual text remains a challenge. Several methods have +alleviated this issue by incorporating explicit text position and content as +guidance on where and what text to render. However, these methods still suffer +from several drawbacks, such as limited flexibility and automation, constrained +capability of layout prediction, and restricted style diversity. In this paper, +we present TextDiffuser-2, aiming to unleash the power of language models for +text rendering. Firstly, we fine-tune a large language model for layout +planning. The large language model is capable of automatically generating +keywords for text rendering and also supports layout modification through +chatting. Secondly, we utilize the language model within the diffusion model to +encode positions and texts at the line level. Unlike previous methods that +employed tight character-level guidance, this approach generates more diverse +text images. We conduct extensive experiments and incorporate user studies +involving human participants as well as GPT-4V, validating TextDiffuser-2's +capacity to achieve a more rational text layout and generation with enhanced +diversity. The code and model will be available at +\url{https://aka.ms/textdiffuser-2}. + +
+
+
+
+
+ + ☆ Bridging the Gap: A Unified Video Comprehension Framework for Moment + Retrieval and Highlight Detection + + +
+ Video Moment Retrieval (MR) and Highlight Detection (HD) have attracted +significant attention due to the growing demand for video analysis. Recent +approaches treat MR and HD as similar video grounding problems and address them +together with transformer-based architectures. However, we observe that the +emphasis of MR and HD differs, with one necessitating the perception of local +relationships and the other prioritizing the understanding of global contexts. +Consequently, the lack of task-specific design inevitably leads to limitations +in capturing the intrinsic specialty of the two tasks. To tackle this issue, we +propose a Unified Video COMprehension framework (UVCOM) to bridge the gap and +jointly solve MR and HD effectively. By performing progressive integration of +intra- and inter-modality information across multiple granularities, UVCOM +achieves a comprehensive understanding of the processed video. Moreover, we +present multi-aspect contrastive learning to consolidate the local relation +modeling and global knowledge accumulation via a well-aligned multi-modal +space. Extensive experiments on the QVHighlights, Charades-STA, TACoS, YouTube +Highlights and TVSum datasets demonstrate the effectiveness and rationality of +UVCOM, which outperforms the state-of-the-art methods by a remarkable margin. + +
+
+
+
+
+ + ☆ Viewport Prediction for Volumetric Video Streaming by Exploring Video + Saliency and Trajectory Information + + +
+ Volumetric video, also known as hologram video, is a novel medium that +portrays natural content in Virtual Reality (VR), Augmented Reality (AR), and +Mixed Reality (MR). It is expected to be the next-gen video technology and a +prevalent use case for 5G and beyond wireless communication. Considering that +each user typically only watches a section of the volumetric video, known as +the viewport, it is essential to have precise viewport prediction for optimal +performance. However, research on this topic is still in its infancy. To this +end, this paper proposes a novel approach, named Saliency and Trajectory +Viewport Prediction (STVP), which aims to improve the precision of viewport +prediction in volumetric video streaming. STVP extensively utilizes video +saliency information and viewport trajectories. To our knowledge, this is the +first comprehensive study of viewport prediction in volumetric video streaming. +In particular, we introduce a novel sampling method, Uniform Random Sampling +(URS), to reduce computational complexity while still preserving video features +in an efficient manner. Then we present a saliency detection technique that +incorporates both spatial and temporal information for detecting static, +dynamic geometric, and color salient regions. Finally, we intelligently fuse +saliency and trajectory information to achieve more accurate viewport +prediction. We conduct extensive simulations to evaluate the effectiveness of +our proposed viewport prediction methods using state-of-the-art volumetric +video sequences. The experimental results show the superiority of the proposed +method over existing schemes. The dataset and source code will be publicly +accessible after acceptance. + +
+
+
+
+
+ + ☆ Spiking Neural Networks with Dynamic Time Steps for Vision Transformers + + +
+ Spiking Neural Networks (SNNs) have emerged as a popular spatio-temporal +computing paradigm for complex vision tasks. Recently proposed SNN training +algorithms have significantly reduced the number of time steps (down to 1) for +improved latency and energy efficiency, however, they target only convolutional +neural networks (CNN). These algorithms, when applied on the recently +spotlighted vision transformers (ViT), either require a large number of time +steps or fail to converge. Based on analysis of the histograms of the ANN and +SNN activation maps, we hypothesize that each ViT block has a different +sensitivity to the number of time steps. We propose a novel training framework +that dynamically allocates the number of time steps to each ViT module +depending on a trainable score assigned to each timestep. In particular, we +generate a scalar binary time step mask that filters spikes emitted by each +neuron in a leaky-integrate-and-fire (LIF) layer. The resulting SNNs have high +activation sparsity and require only accumulate operations (AC), except for the +input embedding layer, in contrast to expensive multiply-and-accumulates (MAC) +needed in traditional ViTs. This yields significant improvements in energy +efficiency. We evaluate our training framework and resulting SNNs on image +recognition tasks including CIFAR10, CIFAR100, and ImageNet with different ViT +architectures. We obtain a test accuracy of 95.97% with 4.97 time steps with +direct encoding on CIFAR10. + +
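To make the gating mechanism above concrete, here is a minimal, assumed sketch of a leaky-integrate-and-fire (LIF) layer whose spikes are suppressed or kept per time step by a binary mask derived from a trainable score, using a straight-through trick for the gradient. The class name, thresholding details, and the omission of a surrogate gradient for the spike function are simplifications, not the paper's exact formulation.

```python
import torch
import torch.nn as nn

class MaskedLIF(nn.Module):
    """LIF layer with a trainable per-time-step binary mask on emitted spikes."""

    def __init__(self, num_steps, tau=2.0, threshold=1.0):
        super().__init__()
        self.num_steps = num_steps
        self.tau = tau
        self.threshold = threshold
        self.step_score = nn.Parameter(torch.zeros(num_steps))  # trainable per-step score

    def forward(self, inputs):
        # inputs: (num_steps, batch, features) pre-synaptic currents
        mem = torch.zeros_like(inputs[0])
        outputs = []
        # hard 0/1 mask forward, soft sigmoid gradient backward (straight-through)
        soft = torch.sigmoid(self.step_score)
        mask = (soft > 0.5).float() + soft - soft.detach()
        for t in range(self.num_steps):
            mem = mem + (inputs[t] - mem) / self.tau           # leaky integration
            spike = (mem >= self.threshold).float() * mask[t]  # gated spikes
            # (a surrogate gradient for the spike would be used in real training)
            mem = mem - spike * self.threshold                 # soft reset
            outputs.append(spike)
        return torch.stack(outputs)
```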
+
+ comment: Under review +
+
+
+
+
+ + ☆ Typhoon Intensity Prediction with Vision Transformer NeurIPS 2023 + + +
+ Predicting typhoon intensity accurately across space and time is crucial for +issuing timely disaster warnings and facilitating emergency response. This has +vast potential for minimizing life losses and property damages as well as +reducing economic and environmental impacts. Leveraging satellite imagery for +scenario analysis is effective but also introduces additional challenges due to +the complex relations among clouds and the highly dynamic context. Existing +deep learning methods in this domain rely on convolutional neural networks +(CNNs), which suffer from limited per-layer receptive fields. This limitation +hinders their ability to capture long-range dependencies and global contextual +knowledge during inference. In response, we introduce a novel approach, namely +"Typhoon Intensity Transformer" (Tint), which leverages self-attention +mechanisms with global receptive fields per layer. Tint adopts a +sequence-to-sequence feature representation learning perspective. It begins by +cutting a given satellite image into a sequence of patches and recursively +employs self-attention operations to extract both local and global contextual +relations between all patch pairs simultaneously, thereby enhancing per-patch +feature representation learning. Extensive experiments on a publicly available +typhoon benchmark validate the efficacy of Tint in comparison with both +state-of-the-art deep learning and conventional meteorological methods. Our +code is available at https://github.com/chen-huanxin/Tint. + +
+
+ comment: 8 pages, 2 figures, accepted by Tackling Climate Change with Machine + Learning: workshop at NeurIPS 2023 +
+
+
+
+
+ + ☆ TopoSemiSeg: Enforcing Topological Consistency for Semi-Supervised + Segmentation of Histopathology Images + + +
+ In computational pathology, segmenting densely distributed objects like +glands and nuclei is crucial for downstream analysis. To alleviate the burden +of obtaining pixel-wise annotations, semi-supervised learning methods learn +from large amounts of unlabeled data. Nevertheless, existing semi-supervised +methods overlook the topological information hidden in the unlabeled images and +are thus prone to topological errors, e.g., missing or incorrectly +merged/separated glands or nuclei. To address this issue, we propose +TopoSemiSeg, the first semi-supervised method that learns the topological +representation from unlabeled data. In particular, we propose a topology-aware +teacher-student approach in which the teacher and student networks learn shared +topological representations. To achieve this, we introduce topological +consistency loss, which contains signal consistency and noise removal losses to +ensure the learned representation is robust and focuses on true topological +signals. Extensive experiments on public pathology image datasets show the +superiority of our method, especially on topology-wise evaluation metrics. Code +is available at https://github.com/Melon-Xu/TopoSemiSeg. + +
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ☆ Centre Stage: Centricity-based Audio-Visual Temporal Action Detection BMVC 2023 + + +
+ Previous one-stage action detection approaches have modelled temporal +dependencies using only the visual modality. In this paper, we explore +different strategies to incorporate the audio modality, using multi-scale +cross-attention to fuse the two modalities. We also demonstrate the correlation +between the distance from the timestep to the action centre and the accuracy of +the predicted boundaries. Thus, we propose a novel network head to estimate the +closeness of timesteps to the action centre, which we call the centricity +score. This leads to increased confidence for proposals that exhibit more +precise boundaries. Our method can be integrated with other one-stage +anchor-free architectures and we demonstrate this on three recent baselines on +the EPIC-Kitchens-100 action detection benchmark where we achieve +state-of-the-art performance. Detailed ablation studies showcase the benefits +of fusing audio and our proposed centricity scores. Code and models for our +proposed method are publicly available at +https://github.com/hanielwang/Audio-Visual-TAD.git + +
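The centricity idea lends itself to a tiny numerical illustration: a target score that is 1 at the centre of a ground-truth action and decays toward the boundaries. The linear form below is an assumption for illustration; the paper's exact definition of the centricity score may differ.

```python
import numpy as np

def centricity_targets(num_timesteps, action_start, action_end):
    """Per-timestep closeness to the action centre, clipped to [0, 1]."""
    t = np.arange(num_timesteps, dtype=np.float32)
    centre = 0.5 * (action_start + action_end)
    half_len = 0.5 * (action_end - action_start)
    # 1 at the action centre, decaying linearly to 0 at (and beyond) the boundaries
    score = 1.0 - np.abs(t - centre) / max(half_len, 1e-6)
    return np.clip(score, 0.0, 1.0)
```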
+
+ comment: Accepted to VUA workshop at BMVC 2023 +
+
+
+
+
+ + ♻ ☆ Brain Diffusion for Visual Exploration: Cortical Discovery using Large + Scale Generative Models NeurIPS 2023 + + +
+ A long standing goal in neuroscience has been to elucidate the functional +organization of the brain. Within higher visual cortex, functional accounts +have remained relatively coarse, focusing on regions of interest (ROIs) and +taking the form of selectivity for broad categories such as faces, places, +bodies, food, or words. Because the identification of such ROIs has typically +relied on manually assembled stimulus sets consisting of isolated objects in +non-ecological contexts, exploring functional organization without robust a +priori hypotheses has been challenging. To overcome these limitations, we +introduce a data-driven approach in which we synthesize images predicted to +activate a given brain region using paired natural images and fMRI recordings, +bypassing the need for category-specific stimuli. Our approach -- Brain +Diffusion for Visual Exploration ("BrainDiVE") -- builds on recent generative +methods by combining large-scale diffusion models with brain-guided image +synthesis. Validating our method, we demonstrate the ability to synthesize +preferred images with appropriate semantic specificity for well-characterized +category-selective ROIs. We then show that BrainDiVE can characterize +differences between ROIs selective for the same high-level category. Finally we +identify novel functional subdivisions within these ROIs, validated with +behavioral data. These results advance our understanding of the fine-grained +functional organization of human visual cortex, and provide well-specified +constraints for further examination of cortical organization using +hypothesis-driven methods. + +
+
+ comment: NeurIPS 2023 (Oral). Project page: + https://www.cs.cmu.edu/~afluo/BrainDiVE/ +
+
+
+
+
+ + ♻ ☆ Video-Bench: A Comprehensive Benchmark and Toolkit for Evaluating + Video-based Large Language Models + + +
+ Video-based large language models (Video-LLMs) have been recently introduced, +targeting both fundamental improvements in perception and comprehension, and a +diverse range of user inquiries. In pursuit of the ultimate goal of achieving +artificial general intelligence, a truly intelligent Video-LLM model should not +only see and understand the surroundings, but also possess human-level +commonsense, and make well-informed decisions for the users. To guide the +development of such a model, the establishment of a robust and comprehensive +evaluation system becomes crucial. To this end, this paper proposes +\textit{Video-Bench}, a new comprehensive benchmark along with a toolkit +specifically designed for evaluating Video-LLMs. The benchmark comprises 10 +meticulously crafted tasks, evaluating the capabilities of Video-LLMs across +three distinct levels: Video-exclusive Understanding, Prior Knowledge-based +Question-Answering, and Comprehension and Decision-making. In addition, we +introduce an automatic toolkit tailored to process model outputs for various +tasks, facilitating the calculation of metrics and generating convenient final +scores. We evaluate 8 representative Video-LLMs using \textit{Video-Bench}. The +findings reveal that current Video-LLMs still fall considerably short of +achieving human-like comprehension and analysis of real-world videos, offering +valuable insights for future research directions. The benchmark and toolkit are +available at: \url{https://github.com/PKU-YuanGroup/Video-Bench}. + +
+
+ comment: Benchmark is available at + https://github.com/PKU-YuanGroup/Video-Bench +
+
+
+
+
+ + ♻ ☆ A Tale of Two Features: Stable Diffusion Complements DINO for Zero-Shot + Semantic Correspondence NeurIPS 23 + + +
+ Text-to-image diffusion models have made significant advances in generating +and editing high-quality images. As a result, numerous approaches have explored +the ability of diffusion model features to understand and process single images +for downstream tasks, e.g., classification, semantic segmentation, and +stylization. However, significantly less is known about what these features +reveal across multiple, different images and objects. In this work, we exploit +Stable Diffusion (SD) features for semantic and dense correspondence and +discover that with simple post-processing, SD features can perform +quantitatively similar to SOTA representations. Interestingly, the qualitative +analysis reveals that SD features have very different properties compared to +existing representation learning features, such as the recently released +DINOv2: while DINOv2 provides sparse but accurate matches, SD features provide +high-quality spatial information but sometimes inaccurate semantic matches. We +demonstrate that a simple fusion of these two features works surprisingly well, +and a zero-shot evaluation using nearest neighbors on these fused features +provides a significant performance gain over state-of-the-art methods on +benchmark datasets, e.g., SPair-71k, PF-Pascal, and TSS. We also show that +these correspondences can enable interesting applications such as instance +swapping in two images. + +
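The zero-shot nearest-neighbour evaluation described above can be summarised by the assumed sketch below: L2-normalise per-location descriptors from the two extractors, concatenate them, and match by cosine similarity. Shapes, names, and the plain concatenation fusion are illustrative; the exact post-processing used in the paper is not reproduced.

```python
import torch
import torch.nn.functional as F

def fuse_and_match(sd_feats_a, dino_feats_a, sd_feats_b, dino_feats_b):
    """Each input: (N, C_*) per-location descriptors for image A or image B.

    Returns, for every location in A, the index of its nearest neighbour in B
    under cosine similarity of the fused (concatenated, normalised) features.
    """
    fa = torch.cat([F.normalize(sd_feats_a, dim=1),
                    F.normalize(dino_feats_a, dim=1)], dim=1)
    fb = torch.cat([F.normalize(sd_feats_b, dim=1),
                    F.normalize(dino_feats_b, dim=1)], dim=1)
    sim = fa @ fb.t()          # cosine-similarity matrix between all location pairs
    return sim.argmax(dim=1)
```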
+
+ comment: Accepted by NeurIPS 23, project page: + https://sd-complements-dino.github.io/ +
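As a rough illustration of the fusion idea described in this abstract, the sketch below L2-normalizes two per-pixel feature maps (stand-ins for Stable Diffusion and DINOv2 descriptors), concatenates them with a weighting, and matches a query pixel across images by nearest neighbour. The random tensors, the 0.5 weighting, and the feature dimensions are assumptions for illustration, not the paper's exact recipe.

```python
import torch
import torch.nn.functional as F

def fuse_features(feat_sd, feat_dino, alpha=0.5):
    """Fuse two (C, H, W) feature maps by normalizing and concatenating.
    alpha balances the two sources; the weighting used in the paper may
    differ -- this is only an illustrative choice."""
    sd = F.normalize(feat_sd.flatten(1), dim=0)      # (C1, H*W)
    dino = F.normalize(feat_dino.flatten(1), dim=0)  # (C2, H*W)
    return torch.cat([alpha * sd, (1 - alpha) * dino], dim=0)

def nearest_neighbor_match(query_fused, target_fused, query_idx):
    """Index in the target image whose fused descriptor has the highest
    cosine similarity to the query pixel's descriptor."""
    q = F.normalize(query_fused[:, query_idx], dim=0)   # (C,)
    t = F.normalize(target_fused, dim=0)                # (C, N)
    return int((q @ t).argmax())

# Hypothetical usage with random stand-ins for real SD / DINOv2 features.
H = W = 32
feats_sd_a, feats_sd_b = torch.randn(1280, H, W), torch.randn(1280, H, W)
feats_dino_a, feats_dino_b = torch.randn(768, H, W), torch.randn(768, H, W)
fused_a = fuse_features(feats_sd_a, feats_dino_a)
fused_b = fuse_features(feats_sd_b, feats_dino_b)
match = nearest_neighbor_match(fused_a, fused_b, query_idx=100)
```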
+
+
+
+
+ + ♻ ☆ Diffusion Models for Interferometric Satellite Aperture Radar + + +
+ Probabilistic Diffusion Models (PDMs) have recently emerged as a very
+promising class of generative models, achieving high performance in natural
+image generation. However, their performance on non-natural images, such as
+radar-based satellite data, remains largely unknown. Generating large amounts
+of synthetic (and especially labelled) satellite data is crucial to implement
+deep-learning approaches for the processing and analysis of (interferometric)
+satellite aperture radar data. Here, we leverage PDMs to generate several
+radar-based satellite image datasets. We show that PDMs succeed in generating
+images with complex and realistic structures, but that sampling time remains an
+issue. Indeed, accelerated sampling strategies, which work well on simple image
+datasets like MNIST, fail on our radar datasets. We provide a simple and
+versatile open-source codebase,
+https://github.com/thomaskerdreux/PDM_SAR_InSAR_generation, to train, sample
+and evaluate PDMs using any dataset on a single GPU.
+
+
+
+
+ + ♻ ☆ Exploiting Causality Signals in Medical Images: A Pilot Study with + Empirical Results + + +
+ We present a novel technique to discover and exploit weak causal signals +directly from images via neural networks for classification purposes. This way, +we model how the presence of a feature in one part of the image affects the +appearance of another feature in a different part of the image. Our method +consists of a convolutional neural network backbone and a causality-factors +extractor module, which computes weights to enhance each feature map according +to its causal influence in the scene. We developed different architecture +variants and empirically evaluated all of our models on two public datasets of +prostate MRI images and breast histopathology slides for cancer diagnosis. To +confirm our quantitative results, we conduct ablation studies and investigate +the explainability of our models via class activation maps. Our findings show +that our lightweight block extracts meaningful information and improves the +overall classification, together with producing more robust predictions that +focus on relevant parts of the image. That is crucial in medical imaging, where +accurate and reliable classifications are essential for effective diagnosis and +treatment planning. + +
+
+ comment: Repeated analyses with new dataset, provided more visual/algorithmic + insights, improved clarity, remarked significance and novelty; 17 pages, 8 + figures, second round review +
+
+
+
+
+ + ♻ ☆ Defining the boundaries: challenges and advances in identifying cells in + microscopy images + + +
+ Segmentation, or the outlining of objects within images, is a critical step +in the measurement and analysis of cells within microscopy images. While +improvements continue to be made in tools that rely on classical methods for +segmentation, deep learning-based tools increasingly dominate advances in the +technology. Specialist models such as Cellpose continue to improve in accuracy +and user-friendliness, and segmentation challenges such as the Multi-Modality +Cell Segmentation Challenge continue to push innovation in accuracy across +widely-varying test data as well as efficiency and usability. Increased +attention on documentation, sharing, and evaluation standards are leading to +increased user-friendliness and acceleration towards the goal of a truly +universal method. + +
+
+ comment: 12 pages, 1 figure, submitted to "Current Opinion in Biotechnology" +
+
+
+
+
+ + ♻ ☆ Exploring Semantic Attributes from A Foundation Model for Federated + Learning of Disjoint Label Spaces + + +
+ Conventional centralised deep learning paradigms are not feasible when data
+from different sources cannot be shared due to data privacy or transmission
+limitations. To resolve this problem, federated learning has been introduced to
+transfer knowledge across multiple sources (clients) with non-shared data while
+optimising a globally generalised central model (server). Existing federated
+learning paradigms mostly focus on transferring holistic high-level knowledge
+(such as class) across models, which is closely related to specific objects of
+interest and may therefore be vulnerable to inversion attacks. In contrast, in
+this work, we consider transferring mid-level semantic knowledge (such as
+attribute) which is not sensitive to specific objects of interest and therefore
+is more privacy-preserving and scalable. To this end, we formulate a new
+Federated Zero-Shot Learning (FZSL) paradigm to learn mid-level semantic
+knowledge at multiple local clients with non-shared local data and cumulatively
+aggregate a globally generalised central model for deployment. To improve model
+discriminative ability, we propose to explore semantic knowledge augmentation
+from external knowledge for enriching the mid-level semantic space in FZSL.
+Extensive experiments on five zero-shot learning benchmark datasets validate
+the effectiveness of our approach for optimising a generalisable federated
+learning model with mid-level semantic knowledge transfer.
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ 360Roam: Real-Time Indoor Roaming Using Geometry-Aware 360$^\circ$ + Radiance Fields + + +
+ Virtual tours built from sparse 360$^\circ$ images are widely used, yet they
+hinder smooth and immersive roaming experiences. The emergence of Neural
+Radiance Field (NeRF) has showcased significant progress in synthesizing novel
+views, unlocking the potential for immersive scene exploration. Nevertheless,
+previous NeRF works primarily focused on object-centric scenarios, resulting in
+noticeable performance degradation when applied to outward-facing and
+large-scale scenes due to limitations in scene parameterization. To achieve
+seamless and real-time indoor roaming, we propose a novel approach using
+geometry-aware radiance fields with adaptively assigned local radiance fields.
+Initially, we employ multiple 360$^\circ$ images of an indoor scene to
+progressively reconstruct explicit geometry in the form of a probabilistic
+occupancy map, derived from a global omnidirectional radiance field.
+Subsequently, we assign local radiance fields through an adaptive
+divide-and-conquer strategy based on the recovered geometry. By incorporating
+geometry-aware sampling and decomposition of the global radiance field, our
+system effectively utilizes positional encoding and compact neural networks to
+enhance rendering quality and speed. Additionally, the extracted floorplan of
+the scene aids in providing visual guidance, contributing to a realistic
+roaming experience. To demonstrate the effectiveness of our system, we curated
+a diverse dataset of 360$^\circ$ images encompassing various real-life scenes,
+on which we conducted extensive experiments. Quantitative and qualitative
+comparisons against baseline approaches illustrated the superior performance of
+our system in large-scale indoor scene roaming.
+
+
+
+
+ + ♻ ☆ Towards Attributions of Input Variables in a Coalition + + +
+ This paper aims to develop a new attribution method to explain the conflict
+between individual variables' attributions and their coalition's attribution
+from a fully new perspective. First, we find that the Shapley value can be
+reformulated as the allocation of Harsanyi interactions encoded by the AI
+model. Second, based on the re-allocation of interactions, we extend the
+Shapley value to the attribution of coalitions. Third, from this perspective,
+we derive the fundamental mechanism behind the conflict: it arises from
+interactions that contain only a subset of the variables in the coalition.
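For readers unfamiliar with the quantities involved, the following minimal sketch computes exact Shapley values by enumeration on a toy value function with a pairwise interaction, and contrasts them with a naive coalition attribution obtained by treating the coalition as a single player. The interaction-based re-allocation proposed in the paper is not reproduced here; this is only background, and the toy value function is an assumption.

```python
from itertools import combinations
from math import factorial

def shapley_values(players, value_fn):
    """Exact Shapley values by enumerating all subsets (exponential cost;
    only sensible for a handful of players, as here)."""
    n = len(players)
    phi = {p: 0.0 for p in players}
    for p in players:
        others = [q for q in players if q != p]
        for k in range(len(others) + 1):
            for subset in combinations(others, k):
                w = factorial(k) * factorial(n - k - 1) / factorial(n)
                phi[p] += w * (value_fn(set(subset) | {p}) - value_fn(set(subset)))
    return phi

# Toy value function with a pairwise interaction between players 0 and 1.
def v(S):
    score = 0.0
    if 0 in S: score += 1.0
    if 1 in S: score += 1.0
    if 0 in S and 1 in S: score += 0.5   # interaction term
    if 2 in S: score += 0.3
    return score

print(shapley_values([0, 1, 2], v))

# Naive coalition attribution: treat {0, 1} as one player and recompute.
def v_coalition(S):
    members = ({0, 1} if "01" in S else set()) | ({2} if 2 in S else set())
    return v(members)

print(shapley_values(["01", 2], v_coalition))
```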
+
+
+
+
+ + ♻ ☆ Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models + + +
+ The popularity of pre-trained large models has revolutionized downstream +tasks across diverse fields, such as language, vision, and multi-modality. To +minimize the adaption cost for downstream tasks, many Parameter-Efficient +Fine-Tuning (PEFT) techniques are proposed for language and 2D image +pre-trained models. However, the specialized PEFT method for 3D pre-trained +models is still under-explored. To this end, we introduce Point-PEFT, a novel +framework for adapting point cloud pre-trained models with minimal learnable +parameters. Specifically, for a pre-trained 3D model, we freeze most of its +parameters, and only tune the newly added PEFT modules on downstream tasks, +which consist of a Point-prior Prompt and a Geometry-aware Adapter. The +Point-prior Prompt adopts a set of learnable prompt tokens, for which we +propose to construct a memory bank with domain-specific knowledge, and utilize +a parameter-free attention to enhance the prompt tokens. The Geometry-aware +Adapter aims to aggregate point cloud features within spatial neighborhoods to +capture fine-grained geometric information through local interactions. +Extensive experiments indicate that our Point-PEFT can achieve better +performance than the full fine-tuning on various downstream tasks, while using +only 5% of the trainable parameters, demonstrating the efficiency and +effectiveness of our approach. Code will be released at +https://github.com/Even-JK/PEFT-3D. + +
+
+ comment: 10 pages. The specialized PEFT framework for 3D pre-trained models, + which achieves competitive performance to full fine-tuning, and significantly + reduces the computational resources. Project page: + https://github.com/Even-JK/PEFT-3D +
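The freeze-and-adapt recipe that Point-PEFT instantiates can be sketched in a few lines: freeze the pre-trained encoder, insert a small residual adapter, and optimize only the new parameters. The encoder, adapter width, and pooling below are placeholders; the actual Point-prior Prompt and Geometry-aware Adapter are considerably richer.

```python
import torch
import torch.nn as nn

class TinyAdapter(nn.Module):
    """Bottleneck adapter with a residual connection (illustrative only; the
    paper's Geometry-aware Adapter also aggregates spatial neighborhoods)."""
    def __init__(self, dim, hidden=32):
        super().__init__()
        self.down = nn.Linear(dim, hidden)
        self.up = nn.Linear(hidden, dim)

    def forward(self, x):
        return x + self.up(torch.relu(self.down(x)))

def make_peft(backbone: nn.Module, dim: int, num_classes: int):
    # Freeze every pre-trained parameter; only adapter + head are trained.
    for p in backbone.parameters():
        p.requires_grad = False
    adapter = TinyAdapter(dim)
    head = nn.Linear(dim, num_classes)
    trainable = list(adapter.parameters()) + list(head.parameters())
    return adapter, head, trainable

# Hypothetical usage: `backbone` stands in for a pre-trained point-cloud encoder.
backbone = nn.Sequential(nn.Linear(3, 256), nn.ReLU(), nn.Linear(256, 256))
adapter, head, trainable = make_peft(backbone, dim=256, num_classes=40)
optimizer = torch.optim.AdamW(trainable, lr=1e-3)
points = torch.randn(8, 1024, 3)          # (batch, points, xyz)
feats = backbone(points).mean(dim=1)      # crude global pooling
logits = head(adapter(feats))
```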
+
+
+
+
+ + ♻ ☆ What If the TV Was Off? Examining Counterfactual Reasoning Abilities of + Multi-modal Language Models + + +
+ Counterfactual reasoning, a fundamental aspect of human cognition, involves +contemplating alternatives to established facts or past events, significantly +enhancing our abilities in planning and decision-making. In light of the +advancements in current multi-modal large language models, we explore their +effectiveness in counterfactual reasoning. To facilitate this investigation, we +introduce a novel dataset, C-VQA, specifically designed to test the +counterfactual reasoning capabilities of modern multi-modal large language +models. This dataset is constructed by infusing original questions with +counterfactual presuppositions, spanning various types such as numerical and +boolean queries. It encompasses a mix of real and synthetic data, representing +a wide range of difficulty levels. Our thorough evaluations of contemporary +vision-language models using this dataset have revealed substantial performance +drops, with some models showing up to a 40% decrease, highlighting a +significant gap between current models and human-like vision reasoning +capabilities. We hope our dataset will serve as a vital benchmark for +evaluating the counterfactual reasoning capabilities of models. Code and +dataset are publicly available at https://bzhao.me/C-VQA/. + +
+
+
+
+
+ + ♻ ☆ FeTrIL: Feature Translation for Exemplar-Free Class-Incremental Learning + + +
+ Exemplar-free class-incremental learning is very challenging due to the +negative effect of catastrophic forgetting. A balance between stability and +plasticity of the incremental process is needed in order to obtain good +accuracy for past as well as new classes. Existing exemplar-free +class-incremental methods focus either on successive fine tuning of the model, +thus favoring plasticity, or on using a feature extractor fixed after the +initial incremental state, thus favoring stability. We introduce a method which +combines a fixed feature extractor and a pseudo-features generator to improve +the stability-plasticity balance. The generator uses a simple yet effective +geometric translation of new class features to create representations of past +classes, made of pseudo-features. The translation of features only requires the +storage of the centroid representations of past classes to produce their +pseudo-features. Actual features of new classes and pseudo-features of past +classes are fed into a linear classifier which is trained incrementally to +discriminate between all classes. The incremental process is much faster with +the proposed method compared to mainstream ones which update the entire deep +model. Experiments are performed with three challenging datasets, and different +incremental settings. A comparison with ten existing methods shows that our +method outperforms the others in most cases. + +
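The pseudo-feature generator described above reduces to a geometric translation, which a few lines of NumPy can illustrate: features of a new class are shifted so that their mean lands on the stored centroid of a past class. Dimensions and data below are synthetic stand-ins, not the paper's setup.

```python
import numpy as np

def pseudo_features(new_feats, new_centroid, old_centroid):
    """Translate new-class features so their mean moves onto the stored
    centroid of a past class (illustrative sketch of the FeTrIL idea)."""
    return new_feats - new_centroid + old_centroid

# Toy example with 512-D features from a (frozen) extractor.
rng = np.random.default_rng(0)
new_feats = rng.normal(size=(100, 512)) + 2.0   # features of a new class
new_centroid = new_feats.mean(axis=0)
old_centroid = rng.normal(size=512)             # stored centroid of a past class
pseudo = pseudo_features(new_feats, new_centroid, old_centroid)

# Pseudo-features keep the new class's spread but are centred on the
# past class's centroid, so only centroids need to be stored.
assert np.allclose(pseudo.mean(axis=0), old_centroid)
```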
+
+
+
+
+ + ♻ ☆ BakedAvatar: Baking Neural Fields for Real-Time Head Avatar Synthesis SIGGRAPH + + +
+ Synthesizing photorealistic 4D human head avatars from videos is essential +for VR/AR, telepresence, and video game applications. Although existing Neural +Radiance Fields (NeRF)-based methods achieve high-fidelity results, the +computational expense limits their use in real-time applications. To overcome +this limitation, we introduce BakedAvatar, a novel representation for real-time +neural head avatar synthesis, deployable in a standard polygon rasterization +pipeline. Our approach extracts deformable multi-layer meshes from learned +isosurfaces of the head and computes expression-, pose-, and view-dependent +appearances that can be baked into static textures for efficient rasterization. +We thus propose a three-stage pipeline for neural head avatar synthesis, which +includes learning continuous deformation, manifold, and radiance fields, +extracting layered meshes and textures, and fine-tuning texture details with +differential rasterization. Experimental results demonstrate that our +representation generates synthesis results of comparable quality to other +state-of-the-art methods while significantly reducing the inference time +required. We further showcase various head avatar synthesis results from +monocular videos, including view synthesis, face reenactment, expression +editing, and pose editing, all at interactive frame rates. + +
+
+ comment: ACM Transactions on Graphics (SIGGRAPH Asia 2023). Project Page: + https://buaavrcg.github.io/BakedAvatar +
+
+
+
+
+ + ♻ ☆ Continuously Controllable Facial Expression Editing in Talking Face + Videos + + +
+ Recently, audio-driven talking face video generation has attracted
+considerable attention. However, very little research addresses the issue of
+emotional editing of these talking face videos with continuously controllable
+expressions, which is in strong demand in the industry. The challenge is that
+speech-related expressions and emotion-related expressions are often highly
+coupled. Meanwhile, traditional image-to-image translation methods cannot work
+well in our application due to the coupling of expressions with other
+attributes such as poses, i.e., translating the expression of the character in
+each frame may simultaneously change the head pose due to the bias of the
+training data distribution. In this paper, we propose a high-quality facial
+expression editing method for talking face videos, allowing the user to control
+the target emotion in the edited video continuously. We present a new
+perspective for this task as a special case of motion information editing,
+where we use a 3DMM to capture major facial movements and an associated texture
+map modeled by a StyleGAN to capture appearance details. Both representations
+(3DMM and texture map) contain emotional information and can be continuously
+modified by neural networks and easily smoothed by averaging in
+coefficient/latent spaces, making our method simple yet effective. We also
+introduce a mouth shape preservation loss to control the trade-off between lip
+synchronization and the degree of exaggeration of the edited expression.
+Extensive experiments and a user study show that our method achieves
+state-of-the-art performance across various evaluation criteria.
+
+ comment: Accepted by IEEE Transactions on Affective Computing (DOI: + 10.1109/TAFFC.2023.3334511). Demo video: https://youtu.be/WD-bNVya6kM . + Project page: https://raineggplant.github.io/FEE4TV +
+
+
+
+
+ + ♻ ☆ Proximal Algorithms for Accelerated Langevin Dynamics + + +
+ We develop a novel class of MCMC algorithms based on a stochastized Nesterov +scheme. With an appropriate addition of noise, the result is a +time-inhomogeneous underdamped Langevin equation, which we prove emits a +specified target distribution as its invariant measure. Convergence rates to +stationarity under Wasserstein-2 distance are established as well. +Metropolis-adjusted and stochastic gradient versions of the proposed Langevin +dynamics are also provided. Experimental illustrations show superior +performance of the proposed method over typical Langevin samplers for different +models in statistics and image processing including better mixing of the +resulting Markov chains. + +
+
+ comment: The technical proofs for the paper will be revised +
+
+
+
+
+ + ♻ ☆ High-performance real-world optical computing trained by in situ + model-free optimization + + +
+ Optical computing systems provide high-speed and low-energy data processing +but face deficiencies in computationally demanding training and +simulation-to-reality gaps. We propose a model-free optimization (MFO) method +based on a score gradient estimation algorithm for computationally efficient in +situ training of optical computing systems. This approach treats an optical +computing system as a black box and back-propagates the loss directly to the +optical computing weights' probability distributions, circumventing the need +for a computationally heavy and biased system simulation. Our experiments on a +single-layer diffractive optical computing system show that MFO outperforms +hybrid training on the MNIST and FMNIST datasets. Furthermore, we demonstrate +image-free and high-speed classification of cells from their phase maps. Our +method's model-free and high-performance nature, combined with its low demand +for computational resources, expedites the transition of optical computing from +laboratory demonstrations to real-world applications. + +
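A minimal sketch of the score-gradient idea, under the assumption of Gaussian-perturbed weights: the loss measured on the black-box system is combined with the sampling noise to estimate the gradient of the expected loss, so no differentiable simulator is needed. The toy quadratic loss stands in for the real optical system, and the hyperparameters are illustrative.

```python
import numpy as np

def score_gradient_step(theta, black_box_loss, n_samples=64, sigma=0.1, lr=0.05):
    """One score-function (REINFORCE-style) update of the weight-distribution
    parameters `theta`. Weights are sampled from N(theta, sigma^2 I) and the
    loss is only *measured*, never differentiated."""
    losses, noises = [], []
    for _ in range(n_samples):
        eps = np.random.randn(*theta.shape)
        losses.append(black_box_loss(theta + sigma * eps))
        noises.append(eps)
    baseline = float(np.mean(losses))                 # variance reduction
    grad = sum((l - baseline) * e for l, e in zip(losses, noises))
    grad /= sigma * n_samples
    return theta - lr * grad

# Toy stand-in for the optical system: loss is smallest when every weight is 0.5.
black_box = lambda w: float(np.mean((w - 0.5) ** 2))
theta = np.zeros(16)
for _ in range(300):
    theta = score_gradient_step(theta, black_box)
print(np.round(theta, 2))   # drifts towards 0.5 despite the black box
```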
+
+
+
+
+ + ♻ ☆ Perceptual Image Compression with Cooperative Cross-Modal Side + Information + + +
+ The explosion of data has resulted in more and more associated text being
+transmitted along with images. Inspired by distributed source coding, many
+works utilize image side information to enhance image compression. However,
+existing methods generally do not consider using text as side information to
+enhance perceptual compression of images, even though the benefits of
+multimodal synergy have been widely demonstrated in research. This begs the
+following question: How can we effectively transfer text-level semantic
+dependencies, which are only available to the decoder, to help image
+compression? In this work, we propose a novel deep image compression method
+with text-guided side information to achieve a better
+rate-perception-distortion tradeoff. Specifically, we employ the CLIP text
+encoder and an effective Semantic-Spatial Aware block to fuse the text and
+image features. This is done by predicting a semantic mask to guide the learned
+text-adaptive affine transformation at the pixel level. Furthermore, we design
+a text-conditional generative adversarial network to improve the perceptual
+quality of reconstructed images. Extensive experiments involving four datasets
+and ten image quality assessment metrics demonstrate that the proposed approach
+achieves superior results in terms of rate-perception trade-off and semantic
+distortion.
+
+
+
+
+ + ♻ ☆ DomainStudio: Fine-Tuning Diffusion Models for Domain-Driven Image + Generation using Limited Data + + +
+ Denoising diffusion probabilistic models (DDPMs) have been proven capable of +synthesizing high-quality images with remarkable diversity when trained on +large amounts of data. Typical diffusion models and modern large-scale +conditional generative models like text-to-image generative models are +vulnerable to overfitting when fine-tuned on extremely limited data. Existing +works have explored subject-driven generation using a reference set containing +a few images. However, few prior works explore DDPM-based domain-driven +generation, which aims to learn the common features of target domains while +maintaining diversity. This paper proposes a novel DomainStudio approach to +adapt DDPMs pre-trained on large-scale source datasets to target domains using +limited data. It is designed to keep the diversity of subjects provided by +source domains and get high-quality and diverse adapted samples in target +domains. We propose to keep the relative distances between adapted samples to +achieve considerable generation diversity. In addition, we further enhance the +learning of high-frequency details for better generation quality. Our approach +is compatible with both unconditional and conditional diffusion models. This +work makes the first attempt to realize unconditional few-shot image generation +with diffusion models, achieving better quality and greater diversity than +current state-of-the-art GAN-based approaches. Moreover, this work also +significantly relieves overfitting for conditional generation and realizes +high-quality domain-driven generation, further expanding the applicable +scenarios of modern large-scale text-to-image models. + +
+
+ comment: extended from DDPM-PA (arXiv:2211.03264), 33 pages, 34 figures +
+
+
+
+
+ + ♻ ☆ Progressive Learning with Visual Prompt Tuning for Variable-Rate Image + Compression + + +
+ In this paper, we propose a progressive learning paradigm for
+transformer-based variable-rate image compression. Our approach covers a wide
+range of compression rates with the assistance of the Layer-adaptive Prompt
+Module (LPM). Inspired by visual prompt tuning, we use LPM to extract prompts
+for input images and hidden features at the encoder side and decoder side,
+respectively, which are fed as additional information into the Swin Transformer
+layer of a pre-trained transformer-based image compression model to affect the
+allocation of attention regions and bits, which in turn changes the target
+compression ratio of the model. To keep the network lightweight, we integrate
+prompt networks with fewer convolutional layers. Exhaustive experiments show
+that compared to methods based on multiple models, which are optimized
+separately for different target rates, the proposed method arrives at the same
+performance with 80% savings in parameter storage and 90% savings in datasets.
+Meanwhile, our model outperforms all current variable-bitrate image methods in
+terms of rate-distortion performance and approaches the state-of-the-art fixed
+bitrate image compression methods trained from scratch.
+
+
+
+
+ + ♻ ☆ Generation Of Colors using Bidirectional Long Short Term Memory Networks + + +
+ Human vision can distinguish between a vast spectrum of colours, estimated to
+be between 2 and 7 million discernible shades. However, this impressive range
+does not inherently imply that all these colours have been precisely named and
+described within our lexicon. We often associate colours with familiar objects
+and concepts in our daily lives. This research endeavors to bridge the gap
+between our visual perception of countless shades and our ability to articulate
+and name them accurately. A novel model has been developed to achieve this
+goal, leveraging Bidirectional Long Short-Term Memory (BiLSTM) networks with
+active learning. This model operates on a proprietary dataset meticulously
+curated for this study. The primary objective of this research is to create a
+versatile tool for categorizing and naming previously unnamed colours or
+identifying intermediate shades that elude traditional colour terminology. The
+findings underscore the potential of this innovative approach in
+revolutionizing our understanding of colour perception and language. Through
+rigorous experimentation and analysis, this study illuminates a promising
+avenue for Natural Language Processing (NLP) applications in diverse
+industries. By facilitating the exploration of the vast colour spectrum, the
+potential applications of NLP are extended beyond conventional boundaries.
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ Extending CAM-based XAI methods for Remote Sensing Imagery Segmentation + + +
+ Current AI-based methods do not provide comprehensible physical
+interpretations of the utilized data, extracted features, and
+predictions/inference operations. As a result, deep learning models trained
+using high-resolution satellite imagery lack transparency and explainability
+and can be merely seen as a black box, which limits their wide-level adoption.
+Experts need help understanding the complex behavior of AI models and the
+underlying decision-making process. Explainable artificial intelligence (XAI)
+is an emerging field providing means for robust, practical, and trustworthy
+deployment of AI models. Several XAI techniques have been proposed for image
+classification tasks, whereas the interpretation of image segmentation remains
+largely unexplored. This paper bridges this gap by adapting recent XAI
+classification algorithms and making them usable for multi-class image
+segmentation, where we mainly focus on buildings' segmentation from
+high-resolution satellite images. To benchmark and compare the performance of
+the proposed approaches, we introduce a new XAI evaluation methodology and
+metric based on "Entropy" to measure the model uncertainty. Conventional XAI
+evaluation methods rely mainly on feeding area-of-interest regions from the
+image back to the pre-trained (utility) model and then calculating the average
+change in the probability of the target class. Those evaluation metrics lack
+the needed robustness, and we show that using Entropy to monitor the model
+uncertainty in segmenting the pixels within the target class is more suitable.
+We hope this work will pave the way for additional XAI research for image
+segmentation and applications in the remote sensing discipline.
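As a sketch of the entropy-based uncertainty measure that the proposed evaluation builds on (assuming the segmentation model outputs per-class probabilities), the snippet below computes per-pixel Shannon entropy and averages it over pixels predicted as the target class. The aggregation choice is illustrative, not the paper's exact metric.

```python
import numpy as np

def pixelwise_entropy(probs, eps=1e-12):
    """Shannon entropy per pixel from an (H, W, C) array of class
    probabilities. Higher entropy means higher model uncertainty."""
    p = np.clip(probs, eps, 1.0)
    return -(p * np.log(p)).sum(axis=-1)

def mean_entropy_in_class(probs, pred_mask, target_class):
    """Average uncertainty over pixels predicted as `target_class`
    (e.g. 'building'); a rough proxy for the evaluation idea above."""
    ent = pixelwise_entropy(probs)
    sel = pred_mask == target_class
    return float(ent[sel].mean()) if sel.any() else 0.0

# Toy example: 2-class probabilities on a 4x4 image.
probs = np.random.dirichlet([1, 1], size=(4, 4))   # shape (4, 4, 2)
pred = probs.argmax(axis=-1)
print(mean_entropy_in_class(probs, pred, target_class=1))
```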
+
+
+
+
+ + ♻ ☆ CLIP-DIY: CLIP Dense Inference Yields Open-Vocabulary Semantic + Segmentation For-Free WACV 2024 + + +
+ The emergence of CLIP has opened the way for open-world image perception. The +zero-shot classification capabilities of the model are impressive but are +harder to use for dense tasks such as image segmentation. Several methods have +proposed different modifications and learning schemes to produce dense output. +Instead, we propose in this work an open-vocabulary semantic segmentation +method, dubbed CLIP-DIY, which does not require any additional training or +annotations, but instead leverages existing unsupervised object localization +approaches. In particular, CLIP-DIY is a multi-scale approach that directly +exploits CLIP classification abilities on patches of different sizes and +aggregates the decision in a single map. We further guide the segmentation +using foreground/background scores obtained using unsupervised object +localization methods. With our method, we obtain state-of-the-art zero-shot +semantic segmentation results on PASCAL VOC and perform on par with the best +methods on COCO. The code is available at +http://github.com/wysoczanska/clip-diy + +
+
+ comment: Accepted to WACV 2024 +
+
+
+
+
+ + ♻ ☆ Event-Free Moving Object Segmentation from Moving Ego Vehicle + + +
+ Moving object segmentation (MOS) in dynamic scenes is challenging for +autonomous driving, especially for sequences obtained from moving ego vehicles. +Most state-of-the-art methods leverage motion cues obtained from optical flow +maps. However, since these methods are often based on optical flows that are +pre-computed from successive RGB frames, this neglects the temporal +consideration of events occurring within inter-frame and limits the +practicality of these methods in real-life situations. To address these +limitations, we propose to exploit event cameras for better video +understanding, which provide rich motion cues without relying on optical flow. +To foster research in this area, we first introduce a novel large-scale dataset +called DSEC-MOS for moving object segmentation from moving ego vehicles. +Subsequently, we devise EmoFormer, a novel network able to exploit the event +data. For this purpose, we fuse the event prior with spatial semantic maps to +distinguish moving objects from the static background, adding another level of +dense supervision around our object of interest - moving ones. Our proposed +network relies only on event data for training but does not require event input +during inference, making it directly comparable to frame-only methods in terms +of efficiency and more widely usable in many application cases. An exhaustive +comparison with 8 state-of-the-art video object segmentation methods highlights +a significant performance improvement of our method over all other methods. +Project Page: https://github.com/ZZY-Zhou/DSEC-MOS. + +
+
+
+
+
+ + ♻ ☆ Unleashing the Potential of Spiking Neural Networks by Dynamic + Confidence ICCV2023 + + +
+ This paper presents a new methodology to alleviate the fundamental trade-off +between accuracy and latency in spiking neural networks (SNNs). The approach +involves decoding confidence information over time from the SNN outputs and +using it to develop a decision-making agent that can dynamically determine when +to terminate each inference. + The proposed method, Dynamic Confidence, provides several significant +benefits to SNNs. 1. It can effectively optimize latency dynamically at +runtime, setting it apart from many existing low-latency SNN algorithms. Our +experiments on CIFAR-10 and ImageNet datasets have demonstrated an average 40% +speedup across eight different settings after applying Dynamic Confidence. 2. +The decision-making agent in Dynamic Confidence is straightforward to construct +and highly robust in parameter space, making it extremely easy to implement. 3. +The proposed method enables visualizing the potential of any given SNN, which +sets a target for current SNNs to approach. For instance, if an SNN can +terminate at the most appropriate time point for each input sample, a ResNet-50 +SNN can achieve an accuracy as high as 82.47% on ImageNet within just 4.71 time +steps on average. Unlocking the potential of SNNs needs a highly-reliable +decision-making agent to be constructed and fed with a high-quality estimation +of ground truth. In this regard, Dynamic Confidence represents a meaningful +step toward realizing the potential of SNNs. + +
+
+ comment: Accepted by ICCV2023 +
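The early-termination idea can be illustrated with a simple fixed-threshold variant: accumulate the SNN's output spikes over time and stop as soon as the running softmax confidence crosses a threshold. The paper's decision-making agent calibrates this rule rather than using a hand-set constant; the spike data below is synthetic.

```python
import numpy as np

def dynamic_confidence_inference(spike_counts_per_step, threshold=0.9):
    """Accumulate output spikes over time steps and terminate inference as
    soon as the softmax confidence of the running prediction exceeds
    `threshold`. `spike_counts_per_step` is a (T, num_classes) array."""
    accumulated = np.zeros(spike_counts_per_step.shape[1])
    for t, spikes in enumerate(spike_counts_per_step, start=1):
        accumulated += spikes
        logits = accumulated / t
        probs = np.exp(logits - logits.max())
        probs /= probs.sum()
        if probs.max() >= threshold:
            return int(probs.argmax()), t   # early exit
    return int(probs.argmax()), t           # used all time steps

# Toy run: 10 classes, 32 time steps of spikes biased towards class 3.
T, C = 32, 10
spikes = np.random.poisson(0.2, size=(T, C))
spikes[:, 3] += np.random.poisson(1.0, size=T)
pred, steps_used = dynamic_confidence_inference(spikes, threshold=0.6)
print(pred, steps_used)
```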
+
+
+
+
+ + ♻ ☆ ID-like Prompt Learning for Few-Shot Out-of-Distribution Detection + + +
+ Out-of-distribution (OOD) detection methods often exploit auxiliary outliers
+to train a model to identify OOD samples, especially by discovering challenging
+outliers from an auxiliary outlier dataset to improve OOD detection. However,
+they may still face limitations in effectively distinguishing the most
+challenging OOD samples that closely resemble in-distribution (ID) data, i.e.,
+ID-like samples. To this end, we propose a novel OOD detection framework that
+discovers ID-like outliers using CLIP from the vicinity space of the ID
+samples, thus helping to identify these most challenging OOD samples. Then a
+prompt learning framework is proposed that utilizes the identified ID-like
+outliers to further leverage the capabilities of CLIP for OOD detection.
+Benefiting from the powerful CLIP, we only need a small number of ID samples to
+learn the prompts of the model without exposing other auxiliary outlier
+datasets. By focusing on the most challenging ID-like OOD samples and elegantly
+exploiting the capabilities of CLIP, our method achieves superior few-shot
+learning performance on various real-world image datasets (e.g., in 4-shot OOD
+detection on the ImageNet-1k dataset, our method reduces the average FPR95 by
+12.16% and improves the average AUROC by 2.76%, compared to state-of-the-art
+methods).
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ CompenHR: Efficient Full Compensation for High-resolution Projector + + +
+ Full projector compensation is a practical task of projector-camera systems. +It aims to find a projector input image, named compensation image, such that +when projected it cancels the geometric and photometric distortions due to the +physical environment and hardware. State-of-the-art methods use deep learning +to address this problem and show promising performance for low-resolution +setups. However, directly applying deep learning to high-resolution setups is +impractical due to the long training time and high memory cost. To address this +issue, this paper proposes a practical full compensation solution. Firstly, we +design an attention-based grid refinement network to improve geometric +correction quality. Secondly, we integrate a novel sampling scheme into an +end-to-end compensation network to alleviate computation and introduce +attention blocks to preserve key features. Finally, we construct a benchmark +dataset for high-resolution projector full compensation. In experiments, our +method demonstrates clear advantages in both efficiency and quality. + +
+
+
+
+
+ + ♻ ☆ Development and evaluation of automated localisation and reconstruction + of all fruits on tomato plants in a greenhouse based on multi-view perception + and 3D multi-object tracking + + +
+ The ability to accurately represent and localise relevant objects is +essential for robots to carry out tasks effectively. Traditional approaches, +where robots simply capture an image, process that image to take an action, and +then forget the information, have proven to struggle in the presence of +occlusions. Methods using multi-view perception, which have the potential to +address some of these problems, require a world model that guides the +collection, integration and extraction of information from multiple viewpoints. +Furthermore, constructing a generic representation that can be applied in +various environments and tasks is a difficult challenge. In this paper, a novel +approach for building generic representations in occluded agro-food +environments using multi-view perception and 3D multi-object tracking is +introduced. The method is based on a detection algorithm that generates partial +point clouds for each detected object, followed by a 3D multi-object tracking +algorithm that updates the representation over time. The accuracy of the +representation was evaluated in a real-world environment, where successful +representation and localisation of tomatoes in tomato plants were achieved, +despite high levels of occlusion, with the total count of tomatoes estimated +with a maximum error of 5.08% and the tomatoes tracked with an accuracy up to +71.47%. Novel tracking metrics were introduced, demonstrating that valuable +insight into the errors in localising and representing the fruits can be +provided by their use. This approach presents a novel solution for building +representations in occluded agro-food environments, demonstrating potential to +enable robots to perform tasks effectively in these challenging environments. + +
+
+
+
+
+ + ♻ ☆ Mixed Hierarchy Network for Image Restoration + + +
+ Image restoration is a long-standing low-level vision problem, e.g., +deblurring and deraining. In the process of image restoration, it is necessary +to consider not only the spatial details and contextual information of +restoration to ensure the quality, but also the system complexity. Although +many methods have been able to guarantee the quality of image restoration, the +system complexity of the state-of-the-art (SOTA) methods is increasing as well. +Motivated by this, we present a mixed hierarchy network that can balance these +competing goals. Our main proposal is a mixed hierarchy architecture, that +progressively recovers contextual information and spatial details from degraded +images while we design intra-blocks to reduce system complexity. Specifically, +our model first learns the contextual information using encoder-decoder +architectures, and then combines them with high-resolution branches that +preserve spatial detail. In order to reduce the system complexity of this +architecture for convenient analysis and comparison, we replace or remove the +nonlinear activation function with multiplication and use a simple network +structure. In addition, we replace spatial convolution with global +self-attention for the middle block of encoder-decoder. The resulting tightly +interlinked hierarchy architecture, named as MHNet, delivers strong performance +gains on several image restoration tasks, including image deraining, and +deblurring. + +
+
+
+
+
+ + ♻ ☆ Uncertainty Aware AI for 2D MRI Segmentation + + +
+ Robust uncertainty estimations are necessary in safety-critical applications
+of Deep Learning. One such example is the semantic segmentation of medical
+images: whilst deep-learning approaches achieve high performance in such tasks,
+they lack interpretability, as they give no indication of their confidence when
+making classification decisions. Robust and interpretable segmentation is a
+critical first stage in automatically screening for pathologies; hence, the
+optimal solution is one which can provide high accuracy while also capturing
+the underlying uncertainty. In this work, we present an uncertainty-aware
+segmentation model, BA U-Net, for use on MRI data that incorporates Bayesian
+Neural Networks and Attention Mechanisms to provide accurate and interpretable
+segmentations. We evaluated our model on the publicly available BraTS 2020
+dataset using F1 Score and Intersection Over Union (IoU) as evaluation metrics.
+
+ comment: 14 Pages, 9 Figures Updated to Correct Typos, Revise Title +
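The abstract does not detail how the Bayesian layers are realized, so as one common, closely related way to obtain per-pixel uncertainty maps, here is a Monte Carlo dropout sketch (a stand-in technique, not necessarily the authors' BA U-Net): run the network several times with dropout kept active and use the variance of the predictions as the uncertainty. The tiny model and input are placeholders.

```python
import torch
import torch.nn as nn

def mc_dropout_predict(model: nn.Module, image: torch.Tensor, n_samples: int = 20):
    """Run a segmentation model several times with dropout active and return
    the mean foreground probability and its per-pixel variance."""
    model.train()   # keep dropout layers stochastic during inference
    with torch.no_grad():
        probs = torch.stack([torch.sigmoid(model(image)) for _ in range(n_samples)])
    return probs.mean(dim=0), probs.var(dim=0)   # prediction, uncertainty

# Hypothetical tiny network standing in for a U-Net with dropout.
model = nn.Sequential(
    nn.Conv2d(1, 8, 3, padding=1), nn.ReLU(), nn.Dropout2d(0.5),
    nn.Conv2d(8, 1, 3, padding=1),
)
image = torch.randn(1, 1, 64, 64)   # one MRI slice
mean_prob, uncertainty = mc_dropout_predict(model, image)
```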
+
+
+
+
+ + ♻ ☆ Towards End-to-End Embodied Decision Making via Multi-modal Large + Language Model: Explorations with GPT4-Vision and Beyond MDM + + +
+ In this study, we explore the potential of Multimodal Large Language Models +(MLLMs) in improving embodied decision-making processes for agents. While Large +Language Models (LLMs) have been widely used due to their advanced reasoning +skills and vast world knowledge, MLLMs like GPT4-Vision offer enhanced visual +understanding and reasoning capabilities. We investigate whether +state-of-the-art MLLMs can handle embodied decision-making in an end-to-end +manner and whether collaborations between LLMs and MLLMs can enhance +decision-making. To address these questions, we introduce a new benchmark +called PCA-EVAL, which evaluates embodied decision-making from the perspectives +of Perception, Cognition, and Action. Additionally, we propose HOLMES, a +multi-agent cooperation framework that allows LLMs to leverage MLLMs and APIs +to gather multimodal information for informed decision-making. We compare +end-to-end embodied decision-making and HOLMES on our benchmark and find that +the GPT4-Vision model demonstrates strong end-to-end embodied decision-making +abilities, outperforming GPT4-HOLMES in terms of average decision accuracy +(+3%). However, this performance is exclusive to the latest GPT4-Vision model, +surpassing the open-source state-of-the-art MLLM by 26%. Our results indicate +that powerful MLLMs like GPT4-Vision hold promise for decision-making in +embodied agents, offering new avenues for MLLM research. Code and data are open +at https://github.com/pkunlp-icler/PCA-EVAL/. + +
+
+ comment: FMDM@NeurIPS2023, Code and data: + https://github.com/pkunlp-icler/PCA-EVAL/ +
+
+
+
+
+ + ♻ ☆ Segmentation of diagnostic tissue compartments on whole slide images + with renal thrombotic microangiopathies (TMAs) + + +
+ The thrombotic microangiopathies (TMAs) manifest in renal biopsy histology
+with a broad spectrum of acute and chronic findings. Precise diagnostic
+criteria for a renal biopsy diagnosis of TMA are missing. As a first step
+towards a machine learning- and computer vision-based analysis of whole slide
+images from renal biopsies, we trained a segmentation model for the decisive
+diagnostic kidney tissue compartments (artery, arteriole, glomerulus) on a set
+of whole slide images from renal biopsies with TMAs and Mimickers (distinct
+diseases with a nephropathological appearance similar to TMA, such as severe
+benign nephrosclerosis, various vasculitides, Bevacizumab-plug glomerulopathy,
+arteriolar light chain deposition disease). Our segmentation model combines a
+U-Net-based tissue detection with a shifted-windows (Swin) transformer
+architecture to reach excellent segmentation results for even the most severely
+altered glomeruli, arterioles and arteries, even on unseen staining domains
+from a different nephropathology lab. With accurate automatic segmentation of
+the decisive renal biopsy compartments in human renal vasculopathies, we have
+laid the foundation for large-scale compartment-specific machine learning and
+computer vision analysis of renal biopsy repositories with TMAs.
+
+ comment: 12 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Segment Anything in 3D with NeRFs NeurIPS 2023 + + +
+ Recently, the Segment Anything Model (SAM) emerged as a powerful vision
+foundation model which is capable of segmenting anything in 2D images. This
+paper aims to generalize SAM to segment 3D objects. Rather than replicating the
+data acquisition and annotation procedure, which is costly in 3D, we design an
+efficient solution, leveraging the Neural Radiance Field (NeRF) as a cheap and
+off-the-shelf prior that connects multi-view 2D images to the 3D space. We
+refer to the proposed solution as SA3D, for Segment Anything in 3D. It only
+requires a manual segmentation prompt (e.g., rough points) for the target
+object in a single view, which is used to generate its 2D mask in this view
+with SAM. Next, SA3D alternately performs mask inverse rendering and cross-view
+self-prompting across various views to iteratively complete the 3D mask of the
+target object constructed with voxel grids. The former projects the 2D mask
+obtained by SAM in the current view onto the 3D mask with guidance from the
+density distribution learned by the NeRF; the latter extracts reliable prompts
+automatically as the input to SAM from the NeRF-rendered 2D mask in another
+view. We show in experiments that SA3D adapts to various scenes and achieves 3D
+segmentation within minutes. Our research reveals a potential methodology to
+lift the ability of a 2D vision foundation model to 3D, as long as the 2D model
+can steadily address promptable segmentation across multiple views. Our code is
+available at https://github.com/Jumpat/SegmentAnythingin3D.
+
+ comment: NeurIPS 2023. Project page: https://jumpat.github.io/SA3D/ +
+
+
+
+
+ + ♻ ☆ On the Road with GPT-4V(ision): Early Explorations of Visual-Language + Model on Autonomous Driving + + +
+ The pursuit of autonomous driving technology hinges on the sophisticated +integration of perception, decision-making, and control systems. Traditional +approaches, both data-driven and rule-based, have been hindered by their +inability to grasp the nuance of complex driving environments and the +intentions of other road users. This has been a significant bottleneck, +particularly in the development of common sense reasoning and nuanced scene +understanding necessary for safe and reliable autonomous driving. The advent of +Visual Language Models (VLM) represents a novel frontier in realizing fully +autonomous vehicle driving. This report provides an exhaustive evaluation of +the latest state-of-the-art VLM, GPT-4V(ision), and its application in +autonomous driving scenarios. We explore the model's abilities to understand +and reason about driving scenes, make decisions, and ultimately act in the +capacity of a driver. Our comprehensive tests span from basic scene recognition +to complex causal reasoning and real-time decision-making under varying +conditions. Our findings reveal that GPT-4V demonstrates superior performance +in scene understanding and causal reasoning compared to existing autonomous +systems. It showcases the potential to handle out-of-distribution scenarios, +recognize intentions, and make informed decisions in real driving contexts. +However, challenges remain, particularly in direction discernment, traffic +light recognition, vision grounding, and spatial reasoning tasks. These +limitations underscore the need for further research and development. Project +is now available on GitHub for interested parties to access and utilize: +\url{https://github.com/PJLab-ADG/GPT4V-AD-Exploration} + +
+
+
+
+
+ + ♻ ☆ Zero-shot Visual Relation Detection via Composite Visual Cues from Large + Language Models + + +
+ Pretrained vision-language models, such as CLIP, have demonstrated strong +generalization capabilities, making them promising tools in the realm of +zero-shot visual recognition. Visual relation detection (VRD) is a typical task +that identifies relationship (or interaction) types between object pairs within +an image. However, naively utilizing CLIP with prevalent class-based prompts +for zero-shot VRD has several weaknesses, e.g., it struggles to distinguish +between different fine-grained relation types and it neglects essential spatial +information of two objects. To this end, we propose a novel method for +zero-shot VRD: RECODE, which solves RElation detection via COmposite +DEscription prompts. Specifically, RECODE first decomposes each predicate +category into subject, object, and spatial components. Then, it leverages large +language models (LLMs) to generate description-based prompts (or visual cues) +for each component. Different visual cues enhance the discriminability of +similar relation categories from different perspectives, which significantly +boosts performance in VRD. To dynamically fuse different cues, we further +introduce a chain-of-thought method that prompts LLMs to generate reasonable +weights for different visual cues. Extensive experiments on four VRD benchmarks +have demonstrated the effectiveness and interpretability of RECODE. + +
+
+
+
+
+ + ♻ ☆ PRIS: Practical robust invertible network for image steganography + + +
+ Image steganography is a technique of hiding secret information inside
+another image, so that the secret is not visible to human eyes and can be
+recovered when needed. Most of the existing image steganography methods have
+low hiding robustness when the container images are affected by distortions
+such as Gaussian noise and lossy compression. This paper proposes PRIS to
+improve the robustness of image steganography; it is based on invertible neural
+networks, and puts two enhancement modules before and after the extraction
+process with a 3-step training strategy. Moreover, rounding error is
+considered, which is always ignored by existing methods but is unavoidable in
+practice. A gradient approximation function (GAF) is also proposed to overcome
+the undifferentiable issue of rounding distortion. Experimental results show
+that our PRIS outperforms the state-of-the-art robust image steganography
+method in both robustness and practicability. Codes are available at
+https://github.com/yanghangAI/PRIS, with a practical demonstration of our model
+at http://yanghang.site/hide/.
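The rounding problem mentioned above is commonly handled with a straight-through style surrogate; the sketch below rounds in the forward pass while letting gradients pass through unchanged in the backward pass. The paper's gradient approximation function (GAF) may use a different smooth approximation; this is only one standard variant, shown for illustration.

```python
import torch

class RoundSTE(torch.autograd.Function):
    """Round in the forward pass, pass gradients through unchanged in the
    backward pass (a straight-through surrogate for the rounding step)."""
    @staticmethod
    def forward(ctx, x):
        return torch.round(x)

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output

def quantize(x):
    return RoundSTE.apply(x)

# The container image is quantized to integers before transmission, yet the
# loss can still back-propagate to the hiding network's parameters.
x = (torch.rand(4, 3, 32, 32) * 255).requires_grad_()
y = quantize(x)
loss = (y / 255.0).pow(2).mean()
loss.backward()
print(x.grad is not None)   # True: gradients flow despite the rounding
```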
+
+
+
+
+ + ♻ ☆ CapST: An Enhanced and Lightweight Model Attribution Approach for + Synthetic Videos + + +
+ Deepfake videos, generated through AI face-swapping techniques, have garnered
+considerable attention due to their potential for powerful impersonation
+attacks. While existing research primarily focuses on binary classification to
+discern between real and fake videos, determining the specific generation model
+behind a fake video is crucial for forensic investigation. Addressing this gap,
+this paper investigates the model attribution problem of Deepfake videos from a
+recently proposed dataset, Deepfakes from Different Models (DFDM), derived from
+various Autoencoder models. The dataset comprises 6,450 Deepfake videos
+generated by five distinct models with variations in encoder, decoder,
+intermediate layer, input resolution, and compression ratio. This study
+formulates Deepfakes model attribution as a multiclass classification task,
+proposing a segment of VGG19, known for its effectiveness in image-related
+tasks, as a feature extraction backbone, integrated with a Capsule Network and
+a Spatio-Temporal attention mechanism. The Capsule module captures intricate
+hierarchies among features for robust identification of deepfake attributes.
+Additionally, the video-level fusion technique leverages temporal attention
+mechanisms to handle concatenated feature vectors, capitalizing on inherent
+temporal dependencies in deepfake videos. By aggregating insights across
+frames, our model gains a comprehensive understanding of video content,
+resulting in more precise predictions. Experimental results on the deepfake
+benchmark dataset (DFDM) demonstrate the efficacy of our proposed method,
+achieving up to a 4% improvement in accurately categorizing deepfake videos
+compared to baseline models while demanding fewer computational resources.
+
+
+
+
+ + ♻ ☆ Deep Planar Parallax for Monocular Depth Estimation + + +
+ Recent research has highlighted the utility of Planar Parallax Geometry in
+monocular depth estimation. However, its potential has yet to be fully realized
+because networks rely heavily on appearance for depth prediction. Our in-depth
+analysis reveals that utilizing flow pre-training can optimize the network's
+use of consecutive-frame modeling, leading to substantial performance gains.
+Additionally, we propose Planar Position Embedding (PPE) to handle dynamic
+objects that defy static scene assumptions and to tackle slope variations that
+are challenging to differentiate. Comprehensive experiments on autonomous
+driving datasets, namely KITTI and the Waymo Open Dataset (WOD), prove that our
+Planar Parallax Network (PPNet) significantly surpasses existing learning-based
+methods in performance.
+
+
+
+
+ + ♻ ☆ Towards Discriminative Representation with Meta-learning for + Colonoscopic Polyp Re-Identification + + +
+ Colonoscopic Polyp Re-Identification aims to match the same polyp from a +large gallery with images from different views taken using different cameras +and plays an important role in the prevention and treatment of colorectal +cancer in computer-aided diagnosis. However, traditional methods for object +ReID directly adopting CNN models trained on the ImageNet dataset usually +produce unsatisfactory retrieval performance on colonoscopic datasets due to +the large domain gap. Additionally, these methods neglect to explore the +potential of self-discrepancy among intra-class relations in the colonoscopic +polyp dataset, which remains an open research problem in the medical community. +To solve this dilemma, we propose a simple but effective training method named +Colo-ReID, which can help our model learn more general and discriminative +knowledge based on the meta-learning strategy in scenarios with fewer samples. +Based on this, a dynamic Meta-Learning Regulation mechanism called MLR is +introduced to further boost the performance of polyp re-identification. To the +best of our knowledge, this is the first attempt to leverage the meta-learning +paradigm instead of traditional machine learning algorithm to effectively train +deep models in the task of colonoscopic polyp re-identification. Empirical +results show that our method significantly outperforms current state-of-the-art +methods by a clear margin. + +
+
+
+
+
+ + ♻ ☆ ShareGPT4V: Improving Large Multi-Modal Models with Better Captions + + +
+ In the realm of large multi-modal models (LMMs), efficient modality alignment +is crucial yet often constrained by the scarcity of high-quality image-text +data. To address this bottleneck, we introduce the ShareGPT4V dataset, a +pioneering large-scale resource featuring 1.2 million highly descriptive +captions, which surpasses existing datasets in diversity and information +content, covering world knowledge, object properties, spatial relationships, +and aesthetic evaluations. Specifically, ShareGPT4V originates from a curated +100K high-quality captions collected from advanced GPT4-Vision and has been +expanded to 1.2M with a superb caption model trained on this subset. ShareGPT4V +first demonstrates its effectiveness for the Supervised Fine-Tuning (SFT) +phase, by substituting an equivalent quantity of detailed captions in existing +SFT datasets with a subset of our high-quality captions, significantly +enhancing the LMMs like LLaVA-7B, LLaVA-1.5-13B, and Qwen-VL-Chat-7B on the MME +and MMBench benchmarks, with respective gains of 222.8/22.0/22.3 and +2.7/1.3/1.5. We further incorporate ShareGPT4V data into both the pre-training +and SFT phases, obtaining ShareGPT4V-7B, a superior LMM based on a simple +architecture that has remarkable performance across a majority of the +multi-modal benchmarks. This project is available at +https://ShareGPT4V.github.io to serve as a pivotal resource for advancing the +LMMs community. + +
+
+ comment: Project: https://ShareGPT4V.github.io +
+
+
+
+
+ + ♻ ☆ COVID-19 detection using ViT transformer-based approach from Computed + Tomography Images + + +
+ Here, we introduce a novel approach to enhance the accuracy and efficiency of
+COVID-19 diagnosis using CT images. Leveraging state-of-the-art Transformer
+models in computer vision, we employed the base ViT Transformer configured for
+224x224-sized input images, modifying the output to suit the binary
+classification task. Notably, input images were resized from the standard CT
+scan size of 512x512 to match the model's expectations. Our method implements a
+systematic patient-level prediction strategy, classifying individual CT slices
+as COVID-19 or non-COVID. To determine the overall diagnosis for each patient,
+a majority voting approach as well as other thresholding approaches were
+employed. This method involves evaluating all CT slices for a given patient and
+assigning the patient the diagnosis determined by thresholding the slice-level
+predictions. This meticulous patient-level prediction process contributes to
+the robustness of our solution as it moves from 2D slices to the 3D patient
+level. Throughout the evaluation process, our approach resulted in a 0.7 macro
+F1 score on the COV19-CT-DB validation set. To ensure the reliability and
+effectiveness of our model, we rigorously validate it on the extensive
+COV19-CT-DB dataset, which is meticulously annotated for the task. This
+dataset, with its comprehensive annotations, reinforces the overall robustness
+of our solution.
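The slice-to-patient aggregation can be sketched directly; below, each slice probability is binarized and the patient is labelled positive when the fraction of positive slices exceeds a threshold (majority voting corresponds to 0.5). The thresholds and example scores are illustrative, not the paper's tuned values.

```python
from typing import List

def patient_diagnosis(slice_probs: List[float], slice_threshold: float = 0.5,
                      patient_threshold: float = 0.5) -> str:
    """Aggregate per-slice COVID probabilities into a patient-level call.
    Each slice is binarized first; the patient is labelled positive if the
    fraction of positive slices exceeds `patient_threshold`."""
    positives = [p >= slice_threshold for p in slice_probs]
    frac_positive = sum(positives) / max(len(positives), 1)
    return "COVID-19" if frac_positive > patient_threshold else "non-COVID"

# Hypothetical patient with 8 CT slices scored by the slice-level ViT.
print(patient_diagnosis([0.1, 0.2, 0.8, 0.9, 0.85, 0.7, 0.3, 0.6]))  # COVID-19
```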
+
+
+
+
+ + ♻ ☆ Enhanced Synthetic MRI Generation from CT Scans Using CycleGAN with + Feature Extraction + + +
+ In the field of radiotherapy, accurate imaging and image registration are of +utmost importance for precise treatment planning. Magnetic Resonance Imaging +(MRI) offers detailed imaging without being invasive and excels in soft-tissue +contrast, making it a preferred modality for radiotherapy planning. However, +the high cost of MRI, longer acquisition time, and certain health +considerations for patients pose challenges. Conversely, Computed Tomography +(CT) scans offer a quicker and less expensive imaging solution. To bridge these +modalities and address multimodal alignment challenges, we introduce an +approach for enhanced monomodal registration using synthetic MRI images. +Utilizing unpaired data, this paper proposes a novel method to produce these +synthetic MRI images from CT scans, leveraging CycleGANs and feature +extractors. By building upon the foundational work on Cycle-Consistent +Adversarial Networks and incorporating advancements from related literature, +our methodology shows promising results, outperforming several state-of-the-art +methods. The efficacy of our approach is validated by multiple comparison +metrics. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ SemanticBoost: Elevating Motion Generation with Augmented Textual Cues + + +
+ Current techniques face difficulties in generating motions from intricate +semantic descriptions, primarily due to insufficient semantic annotations in +datasets and weak contextual understanding. To address these issues, we present +SemanticBoost, a novel framework that tackles both challenges simultaneously. +Our framework comprises a Semantic Enhancement module and a Context-Attuned +Motion Denoiser (CAMD). The Semantic Enhancement module extracts supplementary +semantics from motion data, enriching the dataset's textual description and +ensuring precise alignment between text and motion data without depending on +large language models. On the other hand, the CAMD approach provides an +all-encompassing solution for generating high-quality, semantically consistent +motion sequences by effectively capturing context information and aligning the +generated motion with the given textual descriptions. Distinct from existing +methods, our approach can synthesize accurate orientational movements, combined +motions based on specific body part descriptions, and motions generated from +complex, extended sentences. Our experimental results demonstrate that +SemanticBoost, as a diffusion-based method, outperforms auto-regressive-based +techniques, achieving cutting-edge performance on the Humanml3D dataset while +maintaining realistic and smooth motion generation quality. + +
+
+
+
+
+ + ♻ ☆ MM-NeRF: Multimodal-Guided 3D Multi-Style Transfer of Neural Radiance + Field + + +
+ 3D style transfer aims to generate stylized views of 3D scenes in specified
+styles, which requires high-quality generation while keeping multi-view
+consistency. Existing methods still struggle with high-quality stylization of
+texture details and with stylization under multimodal guidance. In this
+paper, we reveal that the common training scheme for NeRF stylization, which
+generates stylized multi-view supervision with 2D style transfer models,
+causes the same object to show different states (color tone, details, etc.)
+across views; NeRF therefore tends to smooth texture details, which results
+in low-quality rendering for 3D multi-style transfer. To tackle these
+problems, we propose a novel multimodal-guided 3D multi-style transfer of
+NeRF, termed MM-NeRF. First, MM-NeRF projects multimodal guidance into a
+unified space to keep the multimodal styles consistent and extracts
+multimodal features to guide the 3D stylization. Second, a novel multi-head
+learning scheme is proposed to ease the learning of multi-style transfer, and
+a multi-view style-consistency loss is proposed to handle the inconsistency
+of the multi-view supervision data. Finally, a novel incremental learning
+mechanism is introduced to generalize MM-NeRF to any new style at small cost.
+Extensive experiments on several real-world datasets show that MM-NeRF
+achieves high-quality 3D multi-style stylization with multimodal guidance,
+while keeping multi-view consistency and style consistency with the
+multimodal guidance. Codes will be released.
+
+
+
+
+ + ♻ ☆ FIXED: Frustratingly Easy Domain Generalization with Mixup + + +
+ Domain generalization (DG) aims to learn a generalizable model from multiple +training domains such that it can perform well on unseen target domains. A +popular strategy is to augment training data to benefit generalization through +methods such as Mixup~\cite{zhang2018mixup}. While the vanilla Mixup can be +directly applied, theoretical and empirical investigations uncover several +shortcomings that limit its performance. Firstly, Mixup cannot effectively +identify the domain and class information that can be used for learning +invariant representations. Secondly, Mixup may introduce synthetic noisy data +points via random interpolation, which lowers its discrimination capability. +Based on the analysis, we propose a simple yet effective enhancement for +Mixup-based DG, namely domain-invariant Feature mIXup (FIX). It learns +domain-invariant representations for Mixup. To further enhance discrimination, +we leverage existing techniques to enlarge margins among classes to further +propose the domain-invariant Feature MIXup with Enhanced Discrimination (FIXED) +approach. We present theoretical insights about guarantees on its +effectiveness. Extensive experiments on seven public datasets across two +modalities including image classification (Digits-DG, PACS, Office-Home) and +time series (DSADS, PAMAP2, UCI-HAR, and USC-HAD) demonstrate that our approach +significantly outperforms nine state-of-the-art related methods, beating the +best performing baseline by 6.5\% on average in terms of test accuracy. Code is +available at: +https://github.com/jindongwang/transferlearning/tree/master/code/deep/fixed. + +
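+ The feature-level Mixup at the core of this idea can be sketched as below.
+This is a generic illustration rather than the FIXED implementation: it shows
+only the interpolation of intermediate representations and soft labels, and
+omits the domain-invariance and margin-enlargement components the paper adds.
+
+import torch
+
+def feature_mixup(features, labels_onehot, alpha=0.2):
+    # Sample a mixing coefficient and a random pairing within the batch.
+    lam = torch.distributions.Beta(alpha, alpha).sample()
+    perm = torch.randperm(features.size(0))
+    mixed_feat = lam * features + (1 - lam) * features[perm]
+    mixed_label = lam * labels_onehot + (1 - lam) * labels_onehot[perm]
+    return mixed_feat, mixed_label
+
+# Usage: feats = backbone(x); mf, my = feature_mixup(feats, y_onehot); the
+# classifier head is then trained on (mf, my) with a soft-label cross-entropy.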
+
+ comment: First Conference on Parsimony and Learning (CPAL) 2024; code for DG + at: https://github.com/jindongwang/transferlearning/tree/master/code/DeepDG +
+
+
+
+
+ + ♻ ☆ HiFA: High-fidelity Text-to-3D Generation with Advanced Diffusion + Guidance + + +
+ The advancements in automatic text-to-3D generation have been remarkable. +Most existing methods use pre-trained text-to-image diffusion models to +optimize 3D representations like Neural Radiance Fields (NeRFs) via +latent-space denoising score matching. Yet, these methods often result in +artifacts and inconsistencies across different views due to their suboptimal +optimization approaches and limited understanding of 3D geometry. Moreover, the +inherent constraints of NeRFs in rendering crisp geometry and stable textures +usually lead to a two-stage optimization to attain high-resolution details. +This work proposes holistic sampling and smoothing approaches to achieve +high-quality text-to-3D generation, all in a single-stage optimization. We +compute denoising scores in the text-to-image diffusion model's latent and +image spaces. Instead of randomly sampling timesteps (also referred to as noise +levels in denoising score matching), we introduce a novel timestep annealing +approach that progressively reduces the sampled timestep throughout +optimization. To generate high-quality renderings in a single-stage +optimization, we propose regularization for the variance of z-coordinates along +NeRF rays. To address texture flickering issues in NeRFs, we introduce a kernel +smoothing technique that refines importance sampling weights coarse-to-fine, +ensuring accurate and thorough sampling in high-density regions. Extensive +experiments demonstrate the superiority of our method over previous approaches, +enabling the generation of highly detailed and view-consistent 3D assets +through a single-stage training process. + +
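+ The timestep annealing idea can be sketched in a few lines. The linear
+schedule and the bounds below are assumptions for illustration; the paper's
+exact annealing schedule may differ.
+
+import random
+
+def sample_timestep(step, max_steps, t_max=980, t_min=20):
+    # The upper bound of the sampled diffusion timestep shrinks as the 3D
+    # optimization progresses: coarse (high-noise) guidance early, fine
+    # (low-noise) guidance late.
+    frac = step / max_steps
+    upper = int(t_max - frac * (t_max - t_min))
+    return random.randint(t_min, max(t_min, upper))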
+
+ comment: Project page: https://hifa-team.github.io/HiFA-site/ +
+
+
+
+
+ + ♻ ☆ Hierarchical Relationships: A New Perspective to Enhance Scene Graph + Generation NeurIPS 2023 + + +
+ This paper presents a finding that leveraging the hierarchical structures +among labels for relationships and objects can substantially improve the +performance of scene graph generation systems. The focus of this work is to +create an informative hierarchical structure that can divide object and +relationship categories into disjoint super-categories in a systematic way. +Specifically, we introduce a Bayesian prediction head to jointly predict the +super-category of relationships between a pair of object instances, as well as +the detailed relationship within that super-category simultaneously, +facilitating more informative predictions. The resulting model exhibits the +capability to produce a more extensive set of predicates beyond the dataset +annotations, and to tackle the prevalent issue of low annotation quality. While +our paper presents preliminary findings, experiments on the Visual Genome +dataset show its strong performance, particularly in predicate classifications +and zero-shot settings, that demonstrates the promise of our approach. + +
+
+ comment: NeurIPS 2023 New Frontiers in Graph Learning Workshop (NeurIPS + GLFrontiers 2023); NeurIPS 2023 Queer in AI Workshop +
+
+
+
+
+ + ♻ ☆ GraSS: Contrastive Learning with Gradient Guided Sampling Strategy for + Remote Sensing Image Semantic Segmentation + + +
+ Self-supervised contrastive learning (SSCL) has achieved significant
+milestones in remote sensing image (RSI) understanding. Its essence lies in
+designing an unsupervised instance discrimination pretext task that extracts,
+from a large number of unlabeled images, image features beneficial for
+downstream tasks. However, existing instance-discrimination-based SSCL
+suffers from two limitations when applied to the RSI semantic segmentation
+task: 1) a positive sample confounding issue, and 2) a feature adaptation
+bias, which arises because semantic segmentation requires pixel-level or
+object-level rather than instance-level features. In this study, we observe
+that discrimination information can be mapped to specific regions of an RSI
+through the gradient of the unsupervised contrastive loss, and that these
+regions tend to contain single ground objects. Based on this, we propose
+contrastive learning with a Gradient-guided Sampling Strategy (GraSS) for RSI
+semantic segmentation. GraSS consists of two stages: Instance Discrimination
+warm-up (ID warm-up) and Gradient-guided Sampling contrastive training (GS
+training). The ID warm-up provides initial discrimination information to the
+contrastive loss gradients. The GS training stage uses the discrimination
+information contained in these gradients to adaptively select regions of RSI
+patches that contain single ground objects, in order to construct new
+positive and negative samples. Experimental results on three open datasets
+demonstrate that GraSS effectively enhances the performance of SSCL in
+high-resolution RSI semantic segmentation. Compared to seven baseline methods
+from five different types of SSCL, GraSS achieves an average improvement of
+1.57\% and a maximum improvement of 3.58\% in terms of mean intersection over
+union. The source code is available at https://github.com/GeoX-Lab/GraSS
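+ A rough sketch of the gradient-guided region selection is given below. It is
+an assumed simplification of the idea, not the released GraSS code: `encoder`
+and `contrastive_loss` are placeholders, and regions are scored on a simple
+grid by the magnitude of the contrastive-loss gradient with respect to the
+input.
+
+import torch
+
+def gradient_guided_regions(image, encoder, contrastive_loss, grid=4, top_k=3):
+    image = image.clone().requires_grad_(True)                   # (C, H, W)
+    loss = contrastive_loss(encoder(image.unsqueeze(0)))
+    grad = torch.autograd.grad(loss, image)[0].abs().sum(dim=0)  # (H, W) saliency
+    h, w = grad.shape
+    cells = grad.reshape(grid, h // grid, grid, w // grid).sum(dim=(1, 3))
+    _, flat_idx = cells.flatten().topk(top_k)                    # most salient cells
+    return [(int(i) // grid, int(i) % grid) for i in flat_idx]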
+
+ comment: 14 pages, 10 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Improving Image Captioning via Predicting Structured Concepts EMNLP 2023 + + +
+ To address the difficulty of bridging the semantic gap between images and
+texts in image captioning, conventional studies have treated semantic
+concepts as a bridge between the two modalities and improved captioning
+performance accordingly. Although promising results on concept prediction
+were obtained, these studies normally ignore the relationships among
+concepts, which depend not only on the objects in the image but also on word
+dependencies in the text, and which therefore offer considerable potential
+for improving the generation of good descriptions. In this paper, we propose
+a structured concept predictor (SCP) that predicts concepts and their
+structures, and we integrate them into captioning so as to enhance the
+contribution of visual signals via concepts and to further use their
+relations to distinguish cross-modal semantics for better description
+generation. In particular, we design weighted graph convolutional networks
+(W-GCN) to model concept relations driven by word dependencies, and then
+learn differentiated contributions from these concepts for the subsequent
+decoding process. Our approach thus captures potential relations among
+concepts and learns different concepts discriminatively, which effectively
+facilitates image captioning with information inherited across modalities.
+Extensive experiments demonstrate the effectiveness of our approach as well
+as of each proposed module.
+
+ comment: Accepted by EMNLP 2023 (Main Conference, Oral) +
+
+
+
+
+ + ♻ ☆ Monocular Camera Localization for Automated Vehicles Using Image + Retrieval + + +
+ We address the problem of finding the current position and heading angle of
+an autonomous vehicle in real time using a single camera. Compared to methods
+that require LiDARs and high-definition (HD) 3D maps in real time, the
+proposed approach is easily scalable and computationally efficient, at the
+price of lower precision.
+ The new method combines and adapts existing algorithms from three different
+fields: image retrieval, mapping databases, and particle filtering. The
+result is a simple, real-time localization method based on image retrieval
+whose performance is comparable to other monocular camera localization
+methods that use a map built with LiDARs.
+ We evaluate the proposed method on the KITTI odometry dataset and via
+closed-loop experiments with an indoor 1:10-scale autonomous vehicle. The
+tests demonstrate real-time capability and 10 cm-level accuracy. The
+closed-loop indoor tests also reveal a positive feedback loop between the
+localization error and the control error; this phenomenon is analysed in
+detail at the end of the article.
+
+
+
+
+ + ♻ ☆ On the Performance of Multimodal Language Models + + +
+ Instruction-tuned large language models (LLMs) have demonstrated promising +zero-shot generalization capabilities across various downstream tasks. Recent +research has introduced multimodal capabilities to LLMs by integrating +independently pretrained vision encoders through model grafting. These +multimodal variants undergo instruction tuning, similar to LLMs, enabling +effective zero-shot generalization for multimodal tasks. This study conducts a +comparative analysis of different multimodal instruction tuning approaches and +evaluates their performance across a range of tasks, including complex +reasoning, conversation, image captioning, multiple-choice questions (MCQs), +and binary classification. Through rigorous benchmarking and ablation +experiments, we reveal key insights for guiding architectural choices when +incorporating multimodal capabilities into LLMs. However, current approaches +have limitations; they do not sufficiently address the need for a diverse +multimodal instruction dataset, which is crucial for enhancing task +generalization. Additionally, they overlook issues related to truthfulness and +factuality when generating responses. These findings illuminate current +methodological constraints in adapting language models for image comprehension +and provide valuable guidance for researchers and practitioners seeking to +harness multimodal versions of LLMs. + +
+
+
+
+
+ + ♻ ☆ FedSOL: Stabilized Orthogonal Learning in Federated Learning + + +
+ Federated Learning (FL) aggregates locally trained models from individual +clients to construct a global model. While FL enables learning a model with +data privacy, it often suffers from significant performance degradation when +client data distributions are heterogeneous. Many previous FL algorithms have +addressed this issue by introducing various proximal restrictions. These +restrictions aim to encourage global alignment by constraining the deviation of +local learning from the global objective. However, they inherently limit local +learning by interfering with the original local objectives. Recently, an +alternative approach has emerged to improve local learning generality. By +obtaining local models within a smooth loss landscape, this approach mitigates +conflicts among different local objectives of the clients. Yet, it does not +ensure stable global alignment, as local learning does not take the global +objective into account. In this study, we propose Federated Stability on +Learning (FedSoL), which combines both the concepts of global alignment and +local generality. In FedSoL, the local learning seeks a parameter region robust +against proximal perturbations. This strategy introduces an implicit proximal +restriction effect in local learning while maintaining the original local +objective for parameter update. Our experiments show that FedSoL consistently +achieves state-of-the-art performance on various setups. + +
+
+ comment: 19 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ A-JEPA: Joint-Embedding Predictive Architecture Can Listen + + +
+ This paper shows that the masked-modeling principle driving the success of
+large foundational vision models can be effectively applied to audio by
+making predictions in a latent space. We introduce the Audio-based
+Joint-Embedding Predictive Architecture (A-JEPA), a simple extension method
+for self-supervised learning from the audio spectrum. Following the design of
+I-JEPA, A-JEPA encodes visible audio spectrogram patches with a curriculum
+masking strategy via a context encoder, and predicts the representations of
+regions sampled at well-designed locations. The target representations of
+those regions are extracted by the exponential moving average of the context
+encoder, \emph{i.e.}, the target encoder, applied to the whole spectrogram.
+We find it beneficial to move from random block masking to
+time-frequency-aware masking in a curriculum manner, given that audio
+spectrograms are highly correlated in local time and frequency. To enhance
+contextual semantic understanding and robustness, we fine-tune the encoder
+with regularized masking on target datasets, instead of dropping or zeroing
+inputs. Empirically, when built on the Vision Transformer structure, A-JEPA
+is highly scalable and sets new state-of-the-art performance on multiple
+audio and speech classification tasks, outperforming other recent models that
+use externally supervised pre-training.
+
+
+
+
+
+
+
+ + Information Retrieval 16 + +
+
+
+ + ☆ Enhancing Item-level Bundle Representation for Bundle Recommendation + + +
+ Bundle recommendation approaches offer users a set of related items on a
+particular topic. The current state-of-the-art (SOTA) method utilizes
+contrastive learning to learn representations at both the bundle and item
+levels. However, due to the inherent difference between bundle-level and
+item-level preferences, the item-level representations may not receive
+sufficient information from the bundle affiliations to make accurate
+predictions. In this paper, we propose a novel approach, EBRec, short for
+Enhanced Bundle Recommendation, which incorporates two enhanced modules to
+explore inherent item-level bundle representations. First, we incorporate
+bundle-user-item (B-U-I) high-order correlations to exploit more
+collaborative information, thereby enhancing the previous bundle
+representation that relies solely on bundle-item affiliation information.
+Second, we further enhance the B-U-I correlations by augmenting the observed
+user-item interactions with interactions generated from pre-trained models,
+thus improving the item-level bundle representations. We conduct extensive
+experiments on three public datasets, and the results justify the
+effectiveness of our approach as well as of the two core modules. Codes and
+datasets are available at https://github.com/answermycode/EBRec.
+
+
+
+
+ + ☆ Temporal Importance Factor for Loss Functions for CTR Prediction + + +
+ Click-through rate (CTR) prediction is an important task for companies that
+recommend products matching user preferences. User behavior in digital
+advertising is dynamic and changes over time, so it is crucial to capture the
+most recent trends in order to provide more accurate recommendations. In CTR
+prediction, most models use the binary cross-entropy loss function, which
+does not account for the data distribution shifts occurring over time. To
+address this problem, we propose a factor for loss functions that exploits
+the sequential nature of user-item interactions. The approach focuses on the
+most recent samples by weighting them more heavily in the loss, without
+forgetting long-term information. Our solution is model-agnostic, and the
+temporal importance factor can be combined with different loss functions.
+Offline experiments on both public and industrial datasets show that loss
+functions equipped with the temporal importance factor outperform the
+baseline loss functions considered.
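+ One simple instantiation of such a factor is an exponentially decaying
+weight on sample age applied to binary cross-entropy. The exponential form
+and the decay rate below are assumptions for illustration; the paper keeps
+the factor model-agnostic.
+
+import torch
+import torch.nn.functional as F
+
+def temporally_weighted_bce(logits, labels, timestamps, decay=0.1):
+    age = timestamps.max() - timestamps          # 0 for the most recent sample
+    weights = torch.exp(-decay * age.float())    # recent samples weighted near 1
+    bce = F.binary_cross_entropy_with_logits(
+        logits, labels.float(), reduction="none")
+    return (weights * bce).mean()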
+
+
+
+
+ + ☆ MultiCBR: Multi-view Contrastive Learning for Bundle Recommendation + + +
+ Bundle recommendation seeks to recommend a bundle of related items to users
+to improve both user experience and the platform's profits. Existing bundle
+recommendation models have progressed from capturing only user-bundle
+interactions to modeling multiple relations among users, bundles and items.
+CrossCBR, in particular, incorporates cross-view contrastive learning into a
+two-view preference learning framework, significantly improving SOTA
+performance. It does, however, have two limitations: 1) the two-view
+formulation does not fully exploit all the heterogeneous relations among
+users, bundles and items; and 2) the "early contrast and late fusion"
+framework is less effective in capturing user preference and difficult to
+generalize to multiple views. In this paper, we present MultiCBR, a novel
+Multi-view Contrastive learning framework for Bundle Recommendation. First,
+we devise a multi-view representation learning framework capable of capturing
+all the user-bundle, user-item and bundle-item relations, in particular
+better utilizing the bundle-item affiliations to enhance the representations
+of sparse bundles. Second, we innovatively adopt an "early fusion and late
+contrast" design that first fuses the multi-view representations before
+performing self-supervised contrastive learning. In comparison to existing
+approaches, our framework reverses the order of fusion and contrast, which
+brings two advantages: 1) our framework can model both cross-view and
+ego-view preferences, allowing enhanced user preference modeling; and 2)
+instead of requiring a quadratic number of cross-view contrastive losses, we
+only require two self-supervised contrastive losses, resulting in minimal
+extra cost. Experimental results on three public datasets indicate that our
+method outperforms SOTA methods.
+
+
+
+
+ + ☆ RankingGPT: Empowering Large Language Models in Text Ranking with + Progressive Enhancement + + +
+ Text ranking is a critical task in various information retrieval +applications, and the recent success of Large Language Models (LLMs) in natural +language processing has sparked interest in their application to text ranking. +These methods primarily involve combining query and candidate documents and +leveraging prompt learning to determine query-document relevance using the +LLM's output probabilities for specific tokens or by directly generating a +ranked list of candidate documents. Although these approaches have demonstrated +promise, a noteworthy disparity arises between the training objective of LLMs, +which typically centers around next token prediction, and the objective of +evaluating query-document relevance. To address this gap and fully leverage LLM +potential in text ranking tasks, we propose a progressive multi-stage training +strategy. Firstly, we introduce a large-scale weakly supervised dataset of +relevance texts to enable the LLMs to acquire the ability to predict relevant +tokens without altering their original training objective. Subsequently, we +incorporate supervised training to further enhance LLM ranking capability. Our +experimental results on multiple benchmarks demonstrate the superior +performance of our proposed method compared to previous competitive approaches, +both in in-domain and out-of-domain scenarios. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Graph Pre-training and Prompt Learning for Recommendation + + +
+ GNN-based recommenders have excelled in modeling intricate user-item
+interactions through multi-hop message passing. However, existing methods
+often overlook the dynamic nature of evolving user-item interactions, which
+impedes adaptation to changing user preferences and to distribution shifts in
+newly arriving data, limiting their scalability and performance in real-world
+dynamic environments. In this study, we propose GraphPL, a framework that
+incorporates parameter-efficient and dynamic graph pre-training with prompt
+learning. This novel combination empowers GNNs to effectively capture both
+long-term user preferences and short-term behavior dynamics, enabling the
+delivery of accurate and timely recommendations. Our GraphPL framework
+addresses the challenge of evolving user preferences by seamlessly
+integrating a temporal prompt mechanism and a graph-structural prompt
+learning mechanism into the pre-trained GNN model. The temporal prompt
+mechanism encodes time information on user-item interactions, allowing the
+model to naturally capture temporal context, while the graph-structural
+prompt learning mechanism enables the transfer of pre-trained knowledge to
+adapt to behavior dynamics without the need for continuous incremental
+training. We further introduce a dynamic evaluation setting for
+recommendation that mimics real-world dynamic scenarios and better bridges
+the offline-online gap. Our extensive experiments, including a large-scale
+industrial deployment, showcase the lightweight plug-in scalability of
+GraphPL when integrated with various state-of-the-art recommenders,
+emphasizing its advantages in terms of effectiveness, robustness and
+efficiency.
+
+
+
+
+ + ☆ Hyper-Relational Knowledge Graph Neural Network for Next POI + + +
+ With the advancement of mobile technology, Point of Interest (POI)
+recommendation systems in Location-Based Social Networks (LBSN) have brought
+numerous benefits to both users and companies. Many existing works employ
+Knowledge Graphs (KG) to alleviate the data sparsity issue in LBSN. These
+approaches primarily focus on modeling pair-wise relations in LBSN to enrich
+the semantics and thereby relieve the data sparsity issue. However, existing
+approaches seldom consider the hyper-relations in LBSN, such as the mobility
+relation (a 3-ary relation: user-POI-time), which makes it hard for models to
+exploit the semantics accurately. In addition, prior works overlook the rich
+structural information inherent in the KG, which consists of higher-order
+relations and can further alleviate the impact of data sparsity. To this end,
+we propose a Hyper-Relational Knowledge Graph Neural Network (HKGNN) model.
+In HKGNN, a Hyper-Relational Knowledge Graph (HKG) that models the LBSN data
+is constructed to maintain and exploit the rich semantics of hyper-relations.
+We then propose a Hypergraph Neural Network to utilize the structural
+information of the HKG in a cohesive way. In addition, a self-attention
+network is used to leverage sequential information and make personalized
+recommendations. Furthermore, side information, essential for reducing data
+sparsity by providing background knowledge about POIs, is not fully utilized
+in current methods. In light of this, we extend the current dataset with
+available side information to further lessen the impact of data sparsity.
+Results of experiments on four real-world LBSN datasets demonstrate the
+effectiveness of our approach compared to existing state-of-the-art methods.
+
+
+
+
+ + ☆ l2Match: Optimization Techniques on Subgraph Matching Algorithm using + Label Pair, Neighboring Label Index, and Jump-Redo method + + +
+ Graph database is designed to store bidirectional relationships between +objects and facilitate the traversal process to extract a subgraph. However, +the subgraph matching process is an NP-Complete problem. Existing solutions to +this problem usually employ a filter-and-verification framework and a +divide-and-conquer method. The filter-and-verification framework minimizes the +number of inputs to the verification stage by filtering and pruning invalid +candidates as much as possible. Meanwhile, subgraph matching is performed on +the substructure decomposed from the larger graph to yield partial embedding. +Subsequently, the recursive traversal or set intersection technique combines +the partial embedding into a complete subgraph. In this paper, we first present +a comprehensive literature review of the state-of-the-art solutions. l2Match, a +subgraph isomorphism algorithm for small queries utilizing a Label-Pair Index +and filtering method, is then proposed and presented as a proof of concept. +Empirical experimentation shows that l2Match outperforms related +state-of-the-art solutions, and the proposed methods optimize the existing +algorithms. + +
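+ For readers unfamiliar with the framework, a compact filter-and-verification
+skeleton is sketched below. This is a generic illustration of the framework
+the paper builds on, not the l2Match algorithm itself; graphs are plain dicts
+(labels[v] -> label, adj[v] -> set of neighbours), and the filter uses only
+label and degree.
+
+def candidates(q_labels, q_adj, g_labels, g_adj):
+    # Filtering: a data vertex is a candidate for a query vertex if it carries
+    # the same label and has at least as many neighbours.
+    return {u: [v for v in g_labels
+                if g_labels[v] == q_labels[u] and len(g_adj[v]) >= len(q_adj[u])]
+            for u in q_labels}
+
+def verify(q_adj, g_adj, cand, mapping=None):
+    # Verification: backtracking search that extends a partial embedding while
+    # preserving all query edges between already-mapped vertices.
+    mapping = mapping or {}
+    if len(mapping) == len(q_adj):
+        return mapping
+    u = next(u for u in q_adj if u not in mapping)
+    for v in cand[u]:
+        if v in mapping.values():
+            continue
+        if all(v in g_adj[mapping[un]] for un in q_adj[u] if un in mapping):
+            result = verify(q_adj, g_adj, cand, {**mapping, u: v})
+            if result is not None:
+                return result
+    return None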
+
+ comment: A short version of this article (6 pages) has been accepted at
+ ICEIC 2024
+
+
+
+
+ + ☆ SARDINE: A Simulator for Automated Recommendation in Dynamic and + Interactive Environments + + +
+ Simulators can provide valuable insights for researchers and practitioners +who wish to improve recommender systems, because they allow one to easily tweak +the experimental setup in which recommender systems operate, and as a result +lower the cost of identifying general trends and uncovering novel findings +about the candidate methods. A key requirement to enable this accelerated +improvement cycle is that the simulator is able to span the various sources of +complexity that can be found in the real recommendation environment that it +simulates. + With the emergence of interactive and data-driven methods - e.g., +reinforcement learning or online and counterfactual learning-to-rank - that aim +to achieve user-related goals beyond the traditional accuracy-centric +objectives, adequate simulators are needed. In particular, such simulators must +model the various mechanisms that render the recommendation environment dynamic +and interactive, e.g., the effect of recommendations on the user or the effect +of biased data on subsequent iterations of the recommender system. We therefore +propose SARDINE, a flexible and interpretable recommendation simulator that can +help accelerate research in interactive and data-driven recommender systems. We +demonstrate its usefulness by studying existing methods within nine diverse +environments derived from SARDINE, and even uncover novel insights about them. + +
+
+
+
+
+ + ☆ ControlRec: Bridging the Semantic Gap between Language Model and + Personalized Recommendation + + +
+ The successful integration of large language models (LLMs) into
+recommendation systems has proven to be a major breakthrough in recent
+studies, paving the way for more generic and transferable recommendations.
+However, LLMs struggle to effectively utilize user and item IDs, which are
+crucial identifiers for successful recommendations. This is mainly due to
+their distinct representation in a semantic space that differs from the
+natural language (NL) typically used to train LLMs. To tackle this issue, we
+introduce ControlRec, an innovative Contrastive prompt learning framework for
+Recommendation systems. ControlRec treats user IDs and NL as heterogeneous
+features and encodes them individually. To promote greater alignment and
+integration between them in the semantic space, we devise two auxiliary
+contrastive objectives: (1) Heterogeneous Feature Matching (HFM), which
+aligns an item description with the corresponding ID, or with the user's next
+preferred ID based on their interaction sequence, and (2) Instruction
+Contrastive Learning (ICL), which effectively merges these two crucial data
+sources by contrasting probability distributions of output sequences
+generated by diverse tasks. Experimental results on four public real-world
+datasets demonstrate the effectiveness of the proposed method in improving
+model performance.
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ UniIR: Training and Benchmarking Universal Multimodal Information + Retrievers + + +
+ Existing information retrieval (IR) models often assume a homogeneous format, +limiting their applicability to diverse user needs, such as searching for +images with text descriptions, searching for a news article with a headline +image, or finding a similar photo with a query image. To approach such +different information-seeking demands, we introduce UniIR, a unified +instruction-guided multimodal retriever capable of handling eight distinct +retrieval tasks across modalities. UniIR, a single retrieval system jointly +trained on ten diverse multimodal-IR datasets, interprets user instructions to +execute various retrieval tasks, demonstrating robust performance across +existing datasets and zero-shot generalization to new tasks. Our experiments +highlight that multi-task training and instruction tuning are keys to UniIR's +generalization ability. Additionally, we construct the M-BEIR, a multimodal +retrieval benchmark with comprehensive results, to standardize the evaluation +of universal multimodal information retrieval. + +
+
+ comment: Our code and dataset are available on this project page: + https://tiger-ai-lab.github.io/UniIR/ +
+
+
+
+
+ + ☆ ClimateX: Do LLMs Accurately Assess Human Expert Confidence in Climate + Statements? NeurIPS + 2023 + + +
+ Evaluating the accuracy of outputs generated by Large Language Models (LLMs) +is especially important in the climate science and policy domain. We introduce +the Expert Confidence in Climate Statements (ClimateX) dataset, a novel, +curated, expert-labeled dataset consisting of 8094 climate statements collected +from the latest Intergovernmental Panel on Climate Change (IPCC) reports, +labeled with their associated confidence levels. Using this dataset, we show +that recent LLMs can classify human expert confidence in climate-related +statements, especially in a few-shot learning setting, but with limited (up to +47%) accuracy. Overall, models exhibit consistent and significant +over-confidence on low and medium confidence statements. We highlight +implications of our results for climate communication, LLMs evaluation +strategies, and the use of LLMs in information retrieval systems. + +
+
+ comment: Tackling Climate Change with Machine Learning workshop at NeurIPS + 2023 +
+
+
+
+
+ + ♻ ☆ Graph Neural Networks for Recommendation: Reproducibility, Graph + Topology, and Node Representation + + +
+ Graph neural networks (GNNs) have gained prominence in recommendation
+systems in recent years. By representing the user-item matrix as a bipartite
+and undirected graph, GNNs have demonstrated their potential to capture
+short- and long-distance user-item interactions, thereby learning more
+accurate preference patterns than traditional recommendation approaches. In
+contrast to previous tutorials on the same topic, this tutorial aims to
+present and examine three key aspects that characterize GNNs for
+recommendation: (i) the reproducibility of state-of-the-art approaches, (ii)
+the potential impact of graph topological characteristics on the performance
+of these models, and (iii) strategies for learning node representations when
+training features from scratch or utilizing pre-trained embeddings as
+additional item information (e.g., multimodal features). The goal is to
+provide three novel theoretical and practical perspectives on the field,
+currently debated in graph learning but long overlooked in the context of
+recommendation systems.
+
+
+
+
+ + ♻ ☆ Adapting Large Language Models by Integrating Collaborative Semantics + for Recommendation + + +
+ Recently, large language models (LLMs) have shown great potential in +recommender systems, either improving existing recommendation models or serving +as the backbone. However, there exists a large semantic gap between LLMs and +recommender systems, since items to be recommended are often indexed by +discrete identifiers (item ID) out of the LLM's vocabulary. In essence, LLMs +capture language semantics while recommender systems imply collaborative +semantics, making it difficult to sufficiently leverage the model capacity of +LLMs for recommendation. To address this challenge, in this paper, we propose a +new LLM-based recommendation model called LC-Rec, which can better integrate +language and collaborative semantics for recommender systems. Our approach can +directly generate items from the entire item set for recommendation, without +relying on candidate items. Specifically, we make two major contributions in +our approach. For item indexing, we design a learning-based vector quantization +method with uniform semantic mapping, which can assign meaningful and +non-conflicting IDs (called item indices) for items. For alignment tuning, we +propose a series of specially designed tuning tasks to enhance the integration +of collaborative semantics in LLMs. Our fine-tuning tasks enforce LLMs to +deeply integrate language and collaborative semantics (characterized by the +learned item indices), so as to achieve an effective adaptation to recommender +systems. Extensive experiments demonstrate the effectiveness of our method, +showing that our approach can outperform a number of competitive baselines +including traditional recommenders and existing LLM-based recommenders. Our +code is available at https://github.com/RUCAIBox/LC-Rec/. + +
+
+
+
+
+ + ♻ ☆ Patent Documents to Engineering Design Knowledge Graphs + + +
+ Aimed at supporting knowledge-intensive tasks in the design process,
+populating design knowledge from text documents involves the extraction of
+triples - head entity :: relationship :: tail entity, or h :: r :: t - that
+can be combined into a knowledge graph representation. As relationships are
+largely chosen from ontological or common-sense alternatives, knowledge
+graphs built using these depict an approximation or restricted view of design
+knowledge, rather than what is explicated in the text document. In this
+article, we present a data-driven approach to identify and explicate facts
+(h :: r :: t) from sentences in patent documents. We create a dataset of
+44,227 sentences and facts, encompassing all patent classifications while
+also capturing the variations among patent document sections. Using this
+dataset, we train taggers that classify tokens in order to: 1) identify all
+entities (h) and relationships (r), and 2) identify the specific relationship
+(r) for a pair of entities (h :: ___ :: t). While these taggers are built
+upon transformer-based sequence classification models, we evaluate our
+proposed method against edge classification approaches that use linear
+classifiers and graph neural networks, incorporating transformer-based token
+embeddings and linguistic features. The simplicity and coverage of the
+proposed method enable its application to patent documents at any scale and
+variety. Having released an open-source Python package, we apply our method
+to patent documents related to fan systems. From the knowledge graphs thus
+extracted, we explain how facts can be generalised to domain ontologies as
+well as specified down to subsystem levels. We also highlight the importance
+of knowledge graph representations by retrieving and explicating the
+knowledge of key issues in fan systems, while holding a comparative
+discussion against opinions from ChatGPT.
+
+
+
+
+ + ♻ ☆ Hide Your Model: A Parameter Transmission-free Federated Recommender + System + + +
+ With growing concerns regarding user data privacy, the Federated Recommender
+System (FedRec) has garnered significant attention recently due to its
+privacy-preserving capabilities. Existing FedRecs generally adhere to a
+learning protocol in which a central server shares a global recommendation
+model with clients, and participants achieve collaborative learning by
+frequently communicating the model's public parameters. Nevertheless, this
+learning framework has two drawbacks that limit its practical usability: (1)
+It necessitates a globally shared recommendation model; however, in
+real-world scenarios, information related to the recommender model, including
+its algorithm and parameters, constitutes the platform's intellectual
+property, so service providers are unlikely to release such information
+actively. (2) The communication costs of model parameter transmission are
+high, since the model parameters are usually high-dimensional matrices; as
+model sizes increase, this communication burden becomes the bottleneck for
+such traditional FedRecs.
+ Given the above limitations, this paper introduces a novel parameter
+transmission-free federated recommendation framework, named PTF-FedRec, that
+balances the protection of users' data privacy and platforms' model privacy.
+Specifically, participants in PTF-FedRec collaboratively exchange knowledge
+by sharing their predictions within a privacy-preserving mechanism. In this
+way, the central server can learn a recommender model without disclosing its
+model parameters or accessing clients' raw data, preserving both the server's
+model privacy and users' data privacy. Besides, since clients and the central
+server only need to communicate prediction scores, which are just a few real
+numbers, the communication overhead is significantly reduced compared to
+traditional FedRecs.
+
+
+
+
+ + ♻ ☆ Towards an Automatic AI Agent for Reaction Condition Recommendation in + Chemical Synthesis + + +
+ Artificial intelligence (AI) for reaction condition optimization has become +an important topic in the pharmaceutical industry, given that a data-driven AI +model can assist drug discovery and accelerate reaction design. However, +existing AI models lack the chemical insights and real-time knowledge +acquisition abilities of experienced human chemists. This paper proposes a +Large Language Model (LLM) empowered AI agent to bridge this gap. We put forth +a novel three-phase paradigm and applied advanced intelligence-enhancement +methods like in-context learning and multi-LLM debate so that the AI agent can +borrow human insight and update its knowledge by searching the latest chemical +literature. Additionally, we introduce a novel Coarse-label Contrastive +Learning (CCL) based chemical fingerprint that greatly enhances the agent's +performance in optimizing the reaction condition. With the above efforts, the +proposed AI agent can autonomously generate the optimal reaction condition +recommendation without any human interaction. Further, the agent is highly +professional in terms of chemical reactions. It demonstrates close-to-human +performance and strong generalization capability in both dry-lab and wet-lab +experiments. As the first attempt in the chemical AI agent, this work goes a +step further in the field of "AI for chemistry" and opens up new possibilities +for computer-aided synthesis planning. + +
+
+
+
+
+
+
+
+ + Machine Learning 170 + +
+
+
+ + ☆ Mission-driven Exploration for Accelerated Deep Reinforcement Learning + with Temporal Logic Task Specifications + + +
+ This paper addresses the problem of designing optimal control policies for +mobile robots with mission and safety requirements specified using Linear +Temporal Logic (LTL). We consider robots with unknown stochastic dynamics +operating in environments with unknown geometric structure. The robots are +equipped with sensors allowing them to detect obstacles. Our goal is to +synthesize a control policy that maximizes the probability of satisfying an +LTL-encoded task in the presence of motion and environmental uncertainty. +Several deep reinforcement learning (DRL) algorithms have been proposed +recently to address similar problems. A common limitation in related works is +that of slow learning performance. In order to address this issue, we propose a +novel DRL algorithm, which has the capability to learn control policies at a +notably faster rate compared to similar methods. Its sample efficiency is due +to a mission-driven exploration strategy that prioritizes exploration towards +directions that may contribute to mission accomplishment. Identifying these +directions relies on an automaton representation of the LTL task as well as a +learned neural network that (partially) models the unknown system dynamics. We +provide comparative experiments demonstrating the efficiency of our algorithm +on robot navigation tasks in unknown environments. + +
+
+
+
+
+ + ☆ No Representation Rules Them All in Category Discovery NeurIPS 2023 + + +
+ In this paper we tackle the problem of Generalized Category Discovery (GCD).
+Specifically, given a dataset with labelled and unlabelled images, the task
+is to cluster all images in the unlabelled subset, whether or not they belong
+to the labelled categories. Our first contribution is to recognize that most
+existing GCD benchmarks only contain labels for a single clustering of the
+data, making it difficult to ascertain whether models are using the available
+labels to solve the GCD task, or simply solving an unsupervised clustering
+problem. As such, we present a synthetic dataset, named 'Clevr-4', for
+category discovery. Clevr-4 contains four equally valid partitions of the
+data, i.e., based on object shape, texture, color or count. To solve the
+task, models are required to extrapolate the taxonomy specified by the
+labelled set, rather than simply latching onto a single natural grouping of
+the data. We use this dataset to demonstrate the limitations of unsupervised
+clustering in the GCD setting, showing that even very strong unsupervised
+models fail on Clevr-4. We further use Clevr-4 to examine the weaknesses of
+existing GCD algorithms, and propose a new method which addresses these
+shortcomings, leveraging consistent findings from the representation learning
+literature to do so. Our simple solution, which is based on 'mean teachers'
+and termed $\mu$GCD, substantially outperforms implemented baselines on
+Clevr-4. Finally, when we transfer these findings to real data on the
+challenging Semantic Shift Benchmark (SSB), we find that $\mu$GCD outperforms
+all prior work, setting a new state-of-the-art. For the project webpage, see
+https://www.robots.ox.ac.uk/~vgg/data/clevr4/
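+ The 'mean teacher' ingredient mentioned above can be written in a few lines.
+This is a generic illustration of an EMA teacher update, not the authors'
+mu-GCD code; the momentum value is an assumption.
+
+import torch
+
+@torch.no_grad()
+def ema_update(teacher, student, momentum=0.999):
+    # teacher <- momentum * teacher + (1 - momentum) * student, parameter-wise.
+    for t_param, s_param in zip(teacher.parameters(), student.parameters()):
+        t_param.mul_(momentum).add_(s_param, alpha=1.0 - momentum)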
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ☆ DiffuseBot: Breeding Soft Robots With Physics-Augmented Generative + Diffusion Models NeurIPS 2023 + + +
+ Nature evolves creatures with a high complexity of morphological and +behavioral intelligence, meanwhile computational methods lag in approaching +that diversity and efficacy. Co-optimization of artificial creatures' +morphology and control in silico shows promise for applications in physical +soft robotics and virtual character creation; such approaches, however, require +developing new learning algorithms that can reason about function atop pure +structure. In this paper, we present DiffuseBot, a physics-augmented diffusion +model that generates soft robot morphologies capable of excelling in a wide +spectrum of tasks. DiffuseBot bridges the gap between virtually generated +content and physical utility by (i) augmenting the diffusion process with a +physical dynamical simulation which provides a certificate of performance, and +(ii) introducing a co-design procedure that jointly optimizes physical design +and control by leveraging information about physical sensitivities from +differentiable simulation. We showcase a range of simulated and fabricated +robots along with their capabilities. Check our website at +https://diffusebot.github.io/ + +
+
+ comment: NeurIPS 2023. Project page: https://diffusebot.github.io/ +
+
+
+
+
+ + ☆ MobileCLIP: Fast Image-Text Models through Multi-Modal Reinforced + Training + + +
+ Contrastive pretraining of image-text foundation models, such as CLIP, +demonstrated excellent zero-shot performance and improved robustness on a wide +range of downstream tasks. However, these models utilize large +transformer-based encoders with significant memory and latency overhead which +pose challenges for deployment on mobile devices. In this work, we introduce +MobileCLIP -- a new family of efficient image-text models optimized for runtime +performance along with a novel and efficient training approach, namely +multi-modal reinforced training. The proposed training approach leverages +knowledge transfer from an image captioning model and an ensemble of strong +CLIP encoders to improve the accuracy of efficient models. Our approach avoids +train-time compute overhead by storing the additional knowledge in a reinforced +dataset. MobileCLIP sets a new state-of-the-art latency-accuracy tradeoff for +zero-shot classification and retrieval tasks on several datasets. Our +MobileCLIP-S2 variant is 2.3$\times$ faster while more accurate compared to +previous best CLIP model based on ViT-B/16. We further demonstrate the +effectiveness of our multi-modal reinforced training by training a CLIP model +based on ViT-B/16 image backbone and achieving +2.9% average performance +improvement on 38 evaluation benchmarks compared to the previous best. +Moreover, we show that the proposed approach achieves 10$\times$-1000$\times$ +improved learning efficiency when compared with non-reinforced CLIP training. + +
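+ The idea of storing teacher knowledge in a reinforced dataset can be
+sketched as follows. This is an assumed simplification, not the released
+MobileCLIP training code: the `encode_image` teacher API, the ensemble
+averaging, and the cosine distillation loss are illustrative choices, and the
+actual reinforced dataset also stores synthetic captions and similarity
+targets.
+
+import torch
+import torch.nn.functional as F
+
+@torch.no_grad()
+def build_reinforced_entry(image, teachers):
+    # Compute and cache the (averaged, normalized) embedding from an ensemble
+    # of strong CLIP teachers once, so training the efficient student later
+    # adds no teacher inference cost.
+    embs = [F.normalize(t.encode_image(image), dim=-1) for t in teachers]
+    return F.normalize(torch.stack(embs).mean(dim=0), dim=-1)
+
+def distill_loss(student_emb, cached_teacher_emb):
+    # Cosine-similarity distillation against the cached teacher embedding.
+    s = F.normalize(student_emb, dim=-1)
+    return 1.0 - F.cosine_similarity(s, cached_teacher_emb, dim=-1).mean()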
+
+
+
+
+ + ☆ Scalable Extraction of Training Data from (Production) Language Models + + +
+ This paper studies extractable memorization: training data that an adversary +can efficiently extract by querying a machine learning model without prior +knowledge of the training dataset. We show an adversary can extract gigabytes +of training data from open-source language models like Pythia or GPT-Neo, +semi-open models like LLaMA or Falcon, and closed models like ChatGPT. Existing +techniques from the literature suffice to attack unaligned models; in order to +attack the aligned ChatGPT, we develop a new divergence attack that causes the +model to diverge from its chatbot-style generations and emit training data at a +rate 150x higher than when behaving properly. Our methods show practical +attacks can recover far more data than previously thought, and reveal that +current alignment techniques do not eliminate memorization. + +
+
+
+
+
+ + ☆ Is This the Subspace You Are Looking for? An Interpretability Illusion + for Subspace Activation Patching NeurIPS 2023 + + +
+ Mechanistic interpretability aims to understand model behaviors in terms of +specific, interpretable features, often hypothesized to manifest as +low-dimensional subspaces of activations. Specifically, recent studies have +explored subspace interventions (such as activation patching) as a way to +simultaneously manipulate model behavior and attribute the features behind it +to given subspaces. + In this work, we demonstrate that these two aims diverge, potentially leading +to an illusory sense of interpretability. Counterintuitively, even if a +subspace intervention makes the model's output behave as if the value of a +feature was changed, this effect may be achieved by activating a dormant +parallel pathway leveraging another subspace that is causally disconnected from +model outputs. We demonstrate this phenomenon in a distilled mathematical +example, in two real-world domains (the indirect object identification task and +factual recall), and present evidence for its prevalence in practice. In the +context of factual recall, we further show a link to rank-1 fact editing, +providing a mechanistic explanation for previous work observing an +inconsistency between fact editing performance and fact localization. + However, this does not imply that activation patching of subspaces is +intrinsically unfit for interpretability. To contextualize our findings, we +also show what a success case looks like in a task (indirect object +identification) where prior manual circuit analysis informs an understanding of +the location of a feature. We explore the additional evidence needed to argue +that a patched subspace is faithful. + +
+
+ comment: NeurIPS 2023 Workshop on Attributing Model Behavior at Scale +
+
+
+
+
+ + ☆ When the Few Outweigh the Many: Illicit Content Recognition with + Few-Shot Learning + + +
+ The anonymity and untraceability of the Dark Web have driven its
+exponentially increasing popularity while creating a breeding ground for many
+illicit activities. Hence, in collaboration with cybersecurity and law
+enforcement agencies, research has provided approaches for recognizing and
+classifying illicit activities, most of which exploit the recognition of
+textual dark-web market content; few such approaches use images originating
+from dark-web content. This paper investigates this alternative technique for
+recognizing illegal activities from images. In particular, we investigate
+label-agnostic learning techniques such as One-Shot and Few-Shot learning
+featuring the use of Siamese neural networks, a state-of-the-art approach in
+the field. Our solution handles small-scale datasets with promising accuracy.
+In particular, Siamese neural networks reach 90.9% accuracy on 20-Shot
+experiments over a 10-class dataset; this leads us to conclude that such
+models are a promising and cheaper alternative for building automated
+law-enforcement machinery for the dark web.
+
+
+
+
+ + ☆ Computational Hypergraph Discovery, a Gaussian Process framework for + connecting the dots + + +
+ Most scientific challenges can be framed into one of the following three +levels of complexity of function approximation. Type 1: Approximate an unknown +function given input/output data. Type 2: Consider a collection of variables +and functions, some of which are unknown, indexed by the nodes and hyperedges +of a hypergraph (a generalized graph where edges can connect more than two +vertices). Given partial observations of the variables of the hypergraph +(satisfying the functional dependencies imposed by its structure), approximate +all the unobserved variables and unknown functions. Type 3: Expanding on Type +2, if the hypergraph structure itself is unknown, use partial observations of +the variables of the hypergraph to discover its structure and approximate its +unknown functions. While most Computational Science and Engineering and +Scientific Machine Learning challenges can be framed as Type 1 and Type 2 +problems, many scientific problems can only be categorized as Type 3. Despite +their prevalence, these Type 3 challenges have been largely overlooked due to +their inherent complexity. Although Gaussian Process (GP) methods are sometimes +perceived as well-founded but old technology limited to Type 1 curve fitting, +their scope has recently been expanded to Type 2 problems. In this paper, we +introduce an interpretable GP framework for Type 3 problems, targeting the +data-driven discovery and completion of computational hypergraphs. Our approach +is based on a kernel generalization of Row Echelon Form reduction from linear +systems to nonlinear ones and variance-based analysis. Here, variables are +linked via GPs and those contributing to the highest data variance unveil the +hypergraph's structure. We illustrate the scope and efficiency of the proposed +approach with applications to (algebraic) equation discovery, network discovery +(gene pathways, chemical, and mechanical) and raw data analysis. + +
+
+ comment: The code for the algorithm introduced in this paper and its + application to various examples are available for download (and as as an + installable python library/package) at + https://github.com/TheoBourdais/ComputationalHypergraphDiscovery +
+
+
+
+
+ + ☆ An Investigation of Time Reversal Symmetry in Reinforcement Learning + + +
+ One of the fundamental challenges associated with reinforcement learning (RL) +is that collecting sufficient data can be both time-consuming and expensive. In +this paper, we formalize a concept of time reversal symmetry in a Markov +decision process (MDP), which builds upon the established structure of +dynamically reversible Markov chains (DRMCs) and time-reversibility in +classical physics. Specifically, we investigate the utility of this concept in +reducing the sample complexity of reinforcement learning. We observe that +utilizing the structure of time reversal in an MDP allows every environment +transition experienced by an agent to be transformed into a feasible +reverse-time transition, effectively doubling the number of experiences in the +environment. To test the usefulness of this newly synthesized data, we develop +a novel approach called time symmetric data augmentation (TSDA) and investigate +its application in both proprioceptive and pixel-based state within the realm +of off-policy, model-free RL. Empirical evaluations showcase how these +synthetic transitions can enhance the sample efficiency of RL agents in time +reversible scenarios without friction or contact. We also test this method in +more realistic environments where these assumptions are not globally satisfied. +We find that TSDA can significantly degrade sample efficiency and policy +performance, but can also improve sample efficiency under the right conditions. +Ultimately we conclude that time symmetry shows promise in enhancing the sample +efficiency of reinforcement learning and provide guidance when the environment +and reward structures are of an appropriate form for TSDA to be employed +effectively. + +
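+ The augmentation itself amounts to mirroring each stored transition. The
+sketch below is a generic illustration: how the reverse action (and, if
+needed, the reward) is obtained is environment-specific, so a user-supplied
+`reverse_action` helper is assumed here.
+
+def augment_with_reverse(buffer, state, action, reward, next_state, reverse_action):
+    # Store the experienced transition and its time-reversed counterpart,
+    # doubling the data collected from time-reversible dynamics.
+    buffer.append((state, action, reward, next_state))
+    buffer.append((next_state, reverse_action(action), reward, state))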
+
+
+
+
+ + ☆ On the Impact of Sampling on Deep Sequential State Estimation + + +
+ State inference and parameter learning in sequential models can be +successfully performed with approximation techniques that maximize the evidence +lower bound to the marginal log-likelihood of the data distribution. These +methods may be referred to as Dynamical Variational Autoencoders, and our +specific focus lies on the deep Kalman filter. It has been shown that the ELBO +objective can oversimplify data representations, potentially compromising +estimation quality. Tighter Monte Carlo objectives have been proposed in the +literature to enhance generative modeling performance. For instance, the IWAE +objective uses importance weights to reduce the variance of marginal +log-likelihood estimates. In this paper, importance sampling is applied to the +DKF framework for learning deep Markov models, resulting in the IW-DKF, which +shows an improvement in terms of log-likelihood estimates and KL divergence +between the variational distribution and the transition model. The framework +using the sampled DKF update rule is also accommodated to address sequential +state and parameter estimation when working with highly non-linear +physics-based models. An experiment with the 3-space Lorenz attractor shows an +enhanced generative modeling performance and also a decrease in RMSE when +estimating the model parameters and latent states, indicating that tighter MCOs +lead to improved state inference performance. + +
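+ The importance weighted objective referred to above has the standard IWAE
+form; a minimal sketch of its stable computation is given below (a generic
+illustration, not the authors' IW-DKF code).
+
+import math
+import torch
+
+def iwae_bound(log_p_xz, log_q_z):
+    """log_p_xz, log_q_z: (K, batch) tensors of log p(x, z_k) and log q(z_k | x).
+
+    Returns the mean over the batch of log( (1/K) * sum_k p(x, z_k) / q(z_k | x) ).
+    """
+    log_w = log_p_xz - log_q_z                       # log importance weights
+    k = log_w.size(0)
+    return (torch.logsumexp(log_w, dim=0) - math.log(k)).mean()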
+
+ comment: To appear in the Proceedings of the Asilomar Conference on Signals, + Systems, and Computers, October 2023, 5 pages, 3 figures, 3 tables +
+
+
+
+
+ + ☆ Goal-conditioned Offline Planning from Curious Exploration + + +
+ Curiosity has established itself as a powerful exploration strategy in deep +reinforcement learning. Notably, leveraging expected future novelty as +intrinsic motivation has been shown to efficiently generate exploratory +trajectories, as well as a robust dynamics model. We consider the challenge of +extracting goal-conditioned behavior from the products of such unsupervised +exploration techniques, without any additional environment interaction. We find +that conventional goal-conditioned reinforcement learning approaches for +extracting a value function and policy fall short in this difficult offline +setting. By analyzing the geometry of optimal goal-conditioned value functions, +we relate this issue to a specific class of estimation artifacts in learned +values. In order to mitigate their occurrence, we propose to combine +model-based planning over learned value landscapes with a graph-based value +aggregation scheme. We show how this combination can correct both local and +global artifacts, obtaining significant improvements in zero-shot goal-reaching +performance across diverse simulated environments. + +
+
+
+
+
+ + ☆ FedECA: A Federated External Control Arm Method for Causal Inference + with Time-To-Event Data in Distributed Settings + + +
+ External control arms (ECA) can inform the early clinical development of +experimental drugs and provide efficacy evidence for regulatory approval in +non-randomized settings. However, the main challenge of implementing ECA lies +in accessing real-world data or historical clinical trials. Indeed, data +sharing is often not feasible due to privacy considerations related to data +leaving the original collection centers, along with pharmaceutical companies' +competitive motives. In this paper, we leverage a privacy-enhancing technology +called federated learning (FL) to remove some of the barriers to data sharing. +We introduce a federated learning inverse probability of treatment weighted +(IPTW) method for time-to-event outcomes called FedECA which eases the +implementation of ECA by limiting patients' data exposure. We show with +extensive experiments that FedECA outperforms its closest competitor, +matching-adjusted indirect comparison (MAIC), in terms of statistical power and +ability to balance the treatment and control groups. To encourage the use of +such methods, we publicly release our code which relies on Substra, an +open-source FL software with proven experience in privacy-sensitive contexts. + +
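+
+ For readers unfamiliar with IPTW, the snippet below sketches the plain, centralized inverse probability of treatment
+ weighting idea with stabilized weights and made-up data; the federated, time-to-event machinery that constitutes
+ FedECA itself is not shown.
+
+ import numpy as np
+ from sklearn.linear_model import LogisticRegression
+
+ def stabilized_iptw_weights(X, treated):
+     """Fit a propensity model P(T=1 | X) and return stabilized IPTW weights."""
+     propensity = LogisticRegression(max_iter=1000).fit(X, treated).predict_proba(X)[:, 1]
+     p_treated = treated.mean()
+     return np.where(treated == 1,
+                     p_treated / propensity,
+                     (1 - p_treated) / (1 - propensity))
+
+ rng = np.random.default_rng(1)
+ X = rng.normal(size=(200, 3))
+ treated = (X[:, 0] + rng.normal(size=200) > 0).astype(int)
+ print(stabilized_iptw_weights(X, treated)[:5])
+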
+
+ comment: code available at: https://github.com/owkin/fedeca +
+
+
+
+
+ + ☆ Bidirectional Reactive Programming for Machine Learning + + +
+ Reactive languages are dedicated to the programming of systems which interact +continuously and concurrently with their environment. Values take the form of +unbounded streams modeling the (discrete) passing of time or the sequence of +concurrent interactions. While conventional reactivity models recurrences +forward in time, we introduce a symmetric reactive construct enabling backward +recurrences. Constraints on the latter make the implementation +practical. Machine Learning (ML) systems provide numerous motivations for all +of this: we demonstrate that reverse-mode automatic differentiation, +backpropagation, batch normalization, bidirectional recurrent neural networks, +training and reinforcement learning algorithms are all naturally captured as +bidirectional reactive programs. +
+
+
+
+
+ + ☆ Machine learning force-field models for metallic spin glass + + +
+ Metallic spin glass systems, such as dilute magnetic alloys, are +characterized by randomly distributed local moments coupled to each other +through a long-range electron-mediated effective interaction. We present a +scalable machine learning (ML) framework for dynamical simulations of metallic +spin glasses. A Behler-Parrinello type neural-network model, based on the +principle of locality, is developed to accurately and efficiently predict +electron-induced local magnetic fields that drive the spin dynamics. A crucial +component of the ML model is a proper symmetry-invariant representation of +local magnetic environment which is direct input to the neural net. We develop +such a magnetic descriptor by incorporating the spin degrees of freedom into +the atom-centered symmetry function methods which are widely used in ML +force-field models for quantum molecular dynamics. We apply our approach to +study the relaxation dynamics of an amorphous generalization of the s-d model. +Our work highlights the promising potential of ML models for large-scale +dynamical modeling of itinerant magnets with quenched disorder. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ Adaptive Step Sizes for Preconditioned Stochastic Gradient Descent + + +
+ This paper proposes a novel approach to adaptive step sizes in stochastic +gradient descent (SGD) by utilizing quantities that we have identified as +numerically traceable -- the Lipschitz constant for gradients and a concept of +the local variance in search directions. Our findings yield a nearly +hyperparameter-free algorithm for stochastic optimization, which has provable +convergence properties when applied to quadratic problems and exhibits truly +problem adaptive behavior on classical image classification tasks. Our +framework enables the potential inclusion of a preconditioner, thereby enabling +the implementation of adaptive step sizes for stochastic second-order +optimization methods. + +
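+
+ The abstract keeps the step-size rule abstract; the sketch below shows one common way of tracking a local
+ gradient-Lipschitz estimate from successive iterates and using its inverse as the step size. This is a generic
+ illustration of the underlying quantity, not the authors' algorithm, and the running-max safeguard is an added assumption.
+
+ import numpy as np
+
+ def gd_with_lipschitz_step(grad_fn, x0, steps=100, eps=1e-12):
+     """Gradient descent with step 1 / L_hat, where L_hat is a running
+     estimate of the local gradient-Lipschitz constant
+     ||g_k - g_{k-1}|| / ||x_k - x_{k-1}||."""
+     x_prev, g_prev = x0, grad_fn(x0)
+     x = x_prev - 0.1 * g_prev                       # bootstrap first step
+     L_hat = eps
+     for _ in range(steps):
+         g = grad_fn(x)
+         L_hat = max(L_hat, np.linalg.norm(g - g_prev)
+                     / (np.linalg.norm(x - x_prev) + eps))
+         x_prev, g_prev = x, g
+         x = x - g / L_hat
+     return x
+
+ # Quadratic toy problem f(x) = 0.5 * x^T A x with gradient A x.
+ A = np.diag([1.0, 10.0])
+ print(gd_with_lipschitz_step(lambda x: A @ x, np.array([5.0, -3.0])))
+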
+
+
+
+
+ + ☆ Image segmentation with traveling waves in an exactly solvable recurrent + neural network + + +
+ We study image segmentation using spatiotemporal dynamics in a recurrent +neural network where the state of each unit is given by a complex number. We +show that this network generates sophisticated spatiotemporal dynamics that can +effectively divide an image into groups according to a scene's structural +characteristics. Using an exact solution of the recurrent network's dynamics, +we present a precise description of the mechanism underlying object +segmentation in this network, providing a clear mathematical interpretation of +how the network performs this task. We then demonstrate a simple algorithm for +object segmentation that generalizes across inputs ranging from simple +geometric objects in grayscale images to natural images. Object segmentation +across all images is accomplished with one recurrent neural network that has a +single, fixed set of weights. This demonstrates the expressive potential of +recurrent neural networks when constructed using a mathematical approach that +brings together their structure, dynamics, and computation. + +
+
+
+
+
+ + ☆ Debiasing Multimodal Models via Causal Information Minimization EMNLP 2023 + + +
+ Most existing debiasing methods for multimodal models, including causal +intervention and inference methods, utilize approximate heuristics to represent +the biases, such as shallow features from early stages of training or unimodal +features for multimodal tasks like VQA, etc., which may not be accurate. In +this paper, we study bias arising from confounders in a causal graph for +multimodal data and examine a novel approach that leverages causally-motivated +information minimization to learn the confounder representations. Robust +predictive features contain diverse information that helps a model generalize +to out-of-distribution data. Hence, minimizing the information content of +features obtained from a pretrained biased model helps learn the simplest +predictive features that capture the underlying data distribution. We treat +these features as confounder representations and use them via methods motivated +by causal theory to remove bias from models. We find that the learned +confounder representations indeed capture dataset biases, and the proposed +debiasing methods improve out-of-distribution (OOD) performance on multiple +multimodal datasets without sacrificing in-distribution performance. +Additionally, we introduce a novel metric to quantify the sufficiency of +spurious features in models' predictions that further demonstrates the +effectiveness of our proposed methods. Our code is available at: +https://github.com/Vaidehi99/CausalInfoMin + +
+
+ comment: EMNLP 2023 Findings (16 pages) +
+
+
+
+
+ + ☆ Multinomial belief networks + + +
+ A Bayesian approach to machine learning is attractive when we need to +quantify uncertainty or deal with missing observations, when samples are scarce, +or when the data is sparse. All of these commonly apply when analysing +healthcare data. To address these analytical requirements, we propose a deep +generative model for multinomial count data where both the weights and hidden +units of the network are Dirichlet distributed. A Gibbs sampling procedure is +formulated that takes advantage of a series of augmentation relations, +analogous to the Zhou-Cong-Chen model. We apply the model to small handwritten +digits and to a large experimental dataset of DNA mutations in cancer, and we +show how the model is able to extract biologically meaningful meta-signatures +in a fully data-driven way. +
+
+ comment: 9 pages, 3 figs; supplement: 13 pages +
+
+
+
+
+ + ☆ Dendrogram distance: an evaluation metric for generative networks using + hierarchical clustering + + +
+ We present a novel metric for generative modeling evaluation, focusing +primarily on generative networks. The method uses dendrograms to represent real +and fake data, allowing for the divergence between training and generated +samples to be computed. This metric focuses on mode collapse, targeting +generators that are not able to capture all modes in the training set. To +evaluate the proposed method, we introduce a validation scheme based on +sampling from real datasets; the metric is thus evaluated in a controlled +environment and proves to be competitive with other state-of-the-art +approaches. +
+
+
+
+
+ + ☆ Compressing the Backward Pass of Large-Scale Neural Architectures by + Structured Activation Pruning + + +
+ The rise of Deep Neural Networks (DNNs) has led to an increase in model size +and complexity, straining the memory capacity of GPUs. Sparsity in DNNs, +characterized as structural or ephemeral, has gained attention as a solution. +This work focuses on ephemeral sparsity, aiming to reduce memory consumption +during training. It emphasizes the significance of activations, an often +overlooked component, and their role in memory usage. This work employs +structured pruning in Block Sparse Compressed Row (BSR) format in combination +with a magnitude-based criterion to efficiently prune activations. We +furthermore introduce efficient block-sparse operators for GPUs and showcase +their effectiveness, as well as the superior compression offered by block +sparsity. We report the effectiveness of activation pruning by evaluating +training speed, accuracy, and memory usage of large-scale neural architectures +on the example of ResMLP on image classification tasks. As a result, we observe +a memory reduction of up to 32\% while maintaining accuracy. Ultimately, our +approach aims to democratize large-scale model training, reduce GPU +requirements, and address ecological concerns. + +
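+
+ To make the structured activation-pruning idea concrete, the sketch below zeroes whole blocks of an activation matrix
+ whose mean magnitude is smallest; such block-structured zeros are exactly what a Block Sparse Row (BSR) layout can
+ store compactly. Block size and keep ratio are illustrative choices, not the paper's configuration.
+
+ import numpy as np
+
+ def block_magnitude_prune(act, block=(4, 4), keep_ratio=0.5):
+     """Keep only the blocks of `act` with the largest mean |value|,
+     zeroing the rest (structured, block-level sparsity)."""
+     H, W = act.shape
+     bh, bw = block
+     assert H % bh == 0 and W % bw == 0
+     blocks = act.reshape(H // bh, bh, W // bw, bw)
+     scores = np.abs(blocks).mean(axis=(1, 3))       # one score per block
+     k = max(1, int(keep_ratio * scores.size))
+     thresh = np.sort(scores, axis=None)[-k]
+     mask = (scores >= thresh)[:, None, :, None]
+     return (blocks * mask).reshape(H, W)
+
+ x = np.random.default_rng(0).normal(size=(8, 8)).astype(np.float32)
+ print(np.count_nonzero(block_magnitude_prune(x)), "of", x.size, "entries kept")
+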
+
+ comment: 8 pages, 10 figures, submitted to the 6th AccML workshop at HiPEAC + conference 2024 +
+
+
+
+
+ + ☆ Optimisation-Based Multi-Modal Semantic Image Editing + + +
+ Image editing affords increased control over the aesthetics and content of +generated images. Pre-existing works focus predominantly on text-based +instructions to achieve desired image modifications, which limits edit precision +and accuracy. In this work, we propose an inference-time editing optimisation, +designed to extend beyond textual edits to accommodate multiple editing +instruction types (e.g. spatial layout-based; pose, scribbles, edge maps). We +propose to disentangle the editing task into two competing subtasks: successful +local image modifications and global content consistency preservation, where +subtasks are guided through two dedicated loss functions. By allowing the influence +of each loss function to be adjusted, we build a flexible editing solution that +can be tailored to user preferences. We evaluate our method using text, pose +and scribble edit conditions, and highlight our ability to achieve complex +edits, through both qualitative and quantitative experiments. +
+
+
+
+
+ + ☆ Imputation using training labels and classification via label imputation + + +
+ Missing data is a common problem in practical settings. Various imputation +methods have been developed to deal with missing data. However, even though the +label is usually available in the training data, the common practice of +imputation typically relies only on the input and ignores the label. In this +work, we illustrate how stacking the label into the input can significantly +improve the imputation of the input. In addition, we propose a classification +strategy that initializes the predicted test label with missing values and +stacks the label with the input for imputation. This allows imputing the label +and the input at the same time. Also, the technique is capable of handling training +data with missing labels without any prior imputation and is applicable to +continuous, categorical, or mixed-type data. Experiments show promising results +in terms of accuracy. +
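+
+ A runnable sketch of the label-stacking idea, using scikit-learn's IterativeImputer as a stand-in for whatever imputer
+ one prefers: the training label is appended as an extra column so it can inform input imputation, and at test time the
+ unknown label is itself treated as missing and imputed. The imputer choice and the synthetic data are illustrative
+ assumptions, not the paper's exact setup.
+
+ import numpy as np
+ from sklearn.experimental import enable_iterative_imputer  # noqa: F401
+ from sklearn.impute import IterativeImputer
+
+ rng = np.random.default_rng(0)
+ X_train = rng.normal(size=(100, 4)); y_train = (X_train[:, 0] > 0).astype(float)
+ X_test = rng.normal(size=(20, 4))
+ X_train[rng.random(X_train.shape) < 0.2] = np.nan   # inject missing inputs
+ X_test[rng.random(X_test.shape) < 0.2] = np.nan
+
+ # Stack the label as an extra column; for test rows the label is "missing".
+ train_block = np.column_stack([X_train, y_train])
+ test_block = np.column_stack([X_test, np.full(len(X_test), np.nan)])
+
+ imputer = IterativeImputer(max_iter=20, random_state=0).fit(train_block)
+ completed = imputer.transform(test_block)
+ X_test_imputed, y_test_pred = completed[:, :-1], completed[:, -1]
+ print(y_test_pred[:5])   # imputed labels double as (soft) predictions
+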
+
+
+
+
+ + ☆ Digital Twin-Enhanced Deep Reinforcement Learning for Resource + Management in Network Slicing + + +
+ Network slicing-based communication systems can dynamically and efficiently +allocate resources for diversified services. However, due to the limitation of +the network interface on channel access and the complexity of the resource +allocation, it is challenging to achieve an acceptable solution in the +practical system without precise prior knowledge of the dynamics probability +model of the service requests. Existing work attempts to solve this problem +using deep reinforcement learning (DRL); however, such methods usually require +extensive interaction with the real environment to achieve good +results. In this paper, a framework consisting of a digital twin and +reinforcement learning agents is presented to handle this issue. Specifically, we +propose to use the historical data and the neural networks to build a digital +twin model to simulate the state variation law of the real environment. Then, +we use the data generated by the network slicing environment to calibrate the +digital twin so that it is in sync with the real environment. Finally, the DRL agent +for slice optimization optimizes its own performance in this virtual +pre-verification environment. We conducted an exhaustive verification of the +proposed digital twin framework to confirm its scalability. Specifically, we +propose to use loss landscapes to visualize the generalization of DRL +solutions. We explore a distillation-based optimization scheme for lightweight +slicing strategies. In addition, we also extend the framework to offline +reinforcement learning, where solutions can be used to obtain intelligent +decisions based solely on historical data. Numerical simulation experiments +show that the proposed digital twin can significantly improve the performance +of the slice optimization strategy. +
+
+
+
+
+ + ☆ A unified weighting framework for evaluating nearest neighbour + classification + + +
+ We present the first comprehensive and large-scale evaluation of classical +(NN), fuzzy (FNN) and fuzzy rough (FRNN) nearest neighbour classification. We +show that existing proposals for nearest neighbour weighting can be +standardised in the form of kernel functions, applied to the distance values +and/or ranks of the nearest neighbours of a test instance. Furthermore, we +identify three commonly used distance functions and four scaling measures. We +systematically evaluate these choices on a collection of 85 real-life +classification datasets. We find that NN, FNN and FRNN all perform best with +Boscovich distance. NN and FRNN perform best with a combination of Samworth +rank- and distance weights and scaling by the mean absolute deviation around +the median ($r_1$), the standard deviation ($r_2$) or the interquartile range +($r_{\infty}^*$), while FNN performs best with only Samworth distance-weights +and $r_1$- or $r_2$-scaling. We also introduce a new kernel based on fuzzy +Yager negation, and show that NN achieves comparable performance with Yager +distance-weights, which are simpler to implement than a combination of Samworth +distance- and rank-weights. Finally, we demonstrate that FRNN generally +outperforms NN, which in turn performs systematically better than FNN. +
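+
+ To illustrate the kind of kernel-based weighting the evaluation above standardizes, here is a small nearest-neighbour
+ classifier using Boscovich (L1) distance and a simple linearly decaying rank kernel; the Samworth weights and the
+ scaling measures compared in the paper are not reproduced here.
+
+ import numpy as np
+
+ def weighted_knn_predict(X_train, y_train, x, k=5):
+     """Vote among the k nearest neighbours under Boscovich (L1) distance,
+     weighting neighbours with an illustrative linear rank kernel."""
+     d = np.abs(X_train - x).sum(axis=1)             # Boscovich / Manhattan
+     idx = np.argsort(d)[:k]
+     weights = 1.0 - np.arange(k) / k                # 1, (k-1)/k, ..., 1/k
+     votes = {}
+     for label, w in zip(y_train[idx], weights):
+         votes[label] = votes.get(label, 0.0) + w
+     return max(votes, key=votes.get)
+
+ X = np.array([[0.0, 0.0], [0.1, 0.2], [1.0, 1.1], [0.9, 1.0]])
+ y = np.array([0, 0, 1, 1])
+ print(weighted_knn_predict(X, y, np.array([0.95, 1.05]), k=3))
+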
+
+
+
+
+ + ☆ Power Hungry Processing: Watts Driving the Cost of AI Deployment? + + +
+ Recent years have seen a surge in the popularity of commercial AI products +based on generative, multi-purpose AI systems promising a unified approach to +building machine learning (ML) models into technology. However, this ambition +of "generality" comes at a steep cost to the environment, given the amount of +energy these systems require and the amount of carbon that they emit. In this +work, we propose the first systematic comparison of the ongoing inference cost +of various categories of ML systems, covering both task-specific (i.e. +finetuned models that carry out a single task) and `general-purpose' models +(i.e. those trained for multiple tasks). We measure deployment cost as the +amount of energy and carbon required to perform 1,000 inferences on +representative benchmark datasets using these models. We find that +multi-purpose, generative architectures are orders of magnitude more expensive +than task-specific systems for a variety of tasks, even when controlling for +the number of model parameters. We conclude with a discussion around the +current trend of deploying multi-purpose generative ML systems, and caution +that their utility should be more intentionally weighed against increased costs +in terms of energy and emissions. All the data from our study can be accessed +via an interactive demo to carry out further exploration and analysis. +
+
+
+
+
+ + ☆ Data-efficient operator learning for solving high Mach number fluid flow + problems + + +
+ We consider the problem of using SciML to predict solutions of high Mach +fluid flows over irregular geometries. In this setting, data is limited, and so +it is desirable for models to perform well in the low-data setting. We show +that Neural Basis Functions (NBF), which learns a basis of behavior modes from +the data and then uses this basis to make predictions, is more effective than a +basis-unaware baseline model. In addition, we identify continuing challenges in +the space of predicting solutions for this type of problem. + +
+
+
+
+
+ + ☆ Attentional Graph Neural Networks for Robust Massive Network + Localization + + +
+ Graph neural networks (GNNs) have gained significant popularity for +classification tasks in machine learning, yet their applications to regression +problems remain limited. Concurrently, attention mechanisms have emerged as +powerful tools in sequential learning tasks. In this paper, we employ GNNs and +attention mechanisms to address a classical but challenging nonlinear +regression problem: network localization. We propose a novel GNN-based network +localization method that achieves exceptional stability and accuracy in the +presence of severe non-line-of-sight (NLOS) propagations, while eliminating the +need for laborious offline calibration or NLOS identification. Extensive +experimental results validate the effectiveness and high accuracy of our +GNN-based localization model, particularly in challenging NLOS scenarios. +However, the proposed GNN-based model exhibits limited flexibility, and its +accuracy is highly sensitive to a specific hyperparameter that determines the +graph structure. To address the limitations and extend the applicability of the +GNN-based model to real scenarios, we introduce two attentional graph neural +networks (AGNNs) that offer enhanced flexibility and the ability to +automatically learn the optimal hyperparameter for each node. Experimental +results confirm that the AGNN models are able to enhance localization accuracy, +providing a promising solution for real-world applications. We also provide +some analyses of the improved performance achieved by the AGNN models from the +perspectives of dynamic attention and signal denoising characteristics. + +
+
+
+
+
+ + ☆ Identifiable Feature Learning for Spatial Data with Nonlinear ICA + + +
+ Recently, nonlinear ICA has surfaced as a popular alternative to the many +heuristic models used in deep representation learning and disentanglement. An +advantage of nonlinear ICA is that a sophisticated identifiability theory has +been developed; in particular, it has been proven that the original components +can be recovered under sufficiently strong latent dependencies. Despite this +general theory, practical nonlinear ICA algorithms have so far been mainly +limited to data with one-dimensional latent dependencies, especially +time-series data. In this paper, we introduce a new nonlinear ICA framework +that employs $t$-process (TP) latent components which apply naturally to data +with higher-dimensional dependency structures, such as spatial and +spatio-temporal data. In particular, we develop a new learning and inference +algorithm that extends variational inference methods to handle the combination +of a deep neural network mixing function with the TP prior, and employs the +method of inducing points for computational efficiency. On the theoretical side, +we show that such TP independent components are identifiable under very general +conditions. Further, Gaussian Process (GP) nonlinear ICA is established as a +limit of the TP Nonlinear ICA model, and we prove that the identifiability of +the latent components at this GP limit is more restricted. Namely, those +components are identifiable if and only if they have distinctly different +covariance kernels. Our algorithm and identifiability theorems are explored on +simulated spatial data and real-world spatio-temporal data. +
+
+ comment: Work under review +
+
+
+
+
+ + ☆ Modular Neural Networks for Time Series Forecasting: Interpretability + and Feature Selection using Attention + + +
+ Multivariate time series have many applications, from healthcare and +meteorology to life science. Although deep learning models have shown excellent +predictive performance for time series, they have been criticised for being +"black-boxes" or non-interpretable. This paper proposes a novel modular neural +network model for multivariate time series prediction that is interpretable by +construction. A recurrent neural network learns the temporal dependencies in +the data while an attention-based feature selection component selects the most +relevant features and suppresses redundant features used in the learning of the +temporal dependencies. A modular deep network is trained from the selected +features independently to show the users how features influence outcomes, +making the model interpretable. Experimental results show that this approach +can outperform state-of-the-art interpretable Neural Additive Models (NAM) and +variations thereof in both regression and classification of time series tasks, +achieving a predictive performance that is comparable to the top +non-interpretable methods for time series, LSTM and XGBoost. + +
+
+
+
+
+ + ☆ 1-Lipschitz Layers Compared: Memory, Speed, and Certifiable Robustness + + +
+ The robustness of neural networks against input perturbations with bounded +magnitude represents a serious concern in the deployment of deep learning +models in safety-critical systems. Recently, the scientific community has +focused on enhancing certifiable robustness guarantees by crafting 1-Lipschitz +neural networks that leverage Lipschitz bounded dense and convolutional layers. +Although different methods have been proposed in the literature to achieve this +goal, understanding the performance of such methods is not straightforward, +since different metrics can be relevant (e.g., training time, memory usage, +accuracy, certifiable robustness) for different applications. For this reason, +this work provides a thorough theoretical and empirical comparison between +methods by evaluating them in terms of memory usage, speed, and certifiable +robust accuracy. The paper also provides some guidelines and recommendations to +support the user in selecting the methods that work best depending on the +available resources. We provide code at +https://github.com/berndprach/1LipschitzLayersCompared. + +
+
+
+
+
+ + ☆ Decomposer: Semi-supervised Learning of Image Restoration and Image + Decomposition + + +
+ We present Decomposer, a semi-supervised reconstruction model that decomposes +distorted image sequences into their fundamental building blocks - the original +image and the applied augmentations, i.e., shadow, light, and occlusions. To +solve this problem, we use the SIDAR dataset that provides a large number of +distorted image sequences: each sequence contains images with shadows, +lighting, and occlusions applied to an undistorted version. Each distortion +changes the original signal in different ways, e.g., additive or multiplicative +noise. We propose a transformer-based model to explicitly learn this +decomposition. The sequential model uses 3D Swin-Transformers for +spatio-temporal encoding and 3D U-Nets as prediction heads for individual parts +of the decomposition. We demonstrate that by separately pre-training our model +on weakly supervised pseudo labels, we can steer our model to optimize for our +ambiguous problem definition and learn to differentiate between the different +image distortions. + +
+
+
+
+
+ + ☆ Large Language Models Suffer From Their Own Output: An Analysis of the + Self-Consuming Training Loop + + +
+ Large language models (LLM) have become state of the art in many benchmarks +and conversational LLM applications like ChatGPT are now widely used by the +public. Those LLMs can be used to generate large amounts of content which is +posted on the internet to various platforms. As LLMs are trained on datasets +usually collected from the internet, this LLM-generated content might be used +to train the next generation of LLMs. Therefore, a self-consuming training loop +emerges in which new LLM generations are trained on the output from the +previous generations. We empirically study this self-consuming training loop +using a novel dataset to analytically and accurately measure quality and +diversity of generated outputs. We find that this self-consuming training loop +initially improves both quality and diversity. However, after a few generations +the output inevitably degenerates in diversity. We find that the rate of +degeneration depends on the proportion of real and generated data. + +
+
+
+
+
+ + ☆ The HR-Calculus: Enabling Information Processing with Quaternion Algebra + + +
+ From their inception, quaternions and their division algebra have proven to +be advantageous in modelling rotation/orientation in three-dimensional spaces +and have seen use from the initial formulation of electromagnetic field theory +through to forming the basis of quantum field theory. Despite their impressive +versatility in modelling real-world phenomena, adaptive information processing +techniques specifically designed for quaternion-valued signals have only +recently come to the attention of the machine learning, signal processing, and +control communities. The most important development in this direction is the +introduction of the HR-calculus, which provides the required mathematical +foundation for deriving adaptive information processing techniques directly in +the quaternion domain. In this article, the foundations of the HR-calculus are +revisited and the required tools for deriving adaptive learning techniques +suitable for dealing with quaternion-valued signals, such as the gradient +operator, chain and product derivative rules, and Taylor series expansion, are +presented. This serves to establish the most important applications of adaptive +information processing in the quaternion domain for both single-node and +multi-node formulations. The article is supported by Supplementary Material, +which will be referred to as SM. +
+
+
+
+
+ + ☆ Equilibrium in the Computing Continuum through Active Inference + + +
+ Computing Continuum (CC) systems are challenged to ensure the intricate +requirements of each computational tier. Given the system's scale, the Service +Level Objectives (SLOs), which are expressed as these requirements, must be +broken down into smaller parts that can be decentralized. We present our +framework for collaborative edge intelligence enabling individual edge devices +to (1) develop a causal understanding of how to enforce their SLOs, and (2) +transfer knowledge to speed up the onboarding of heterogeneous devices. Through +collaboration, they (3) increase the scope of SLO fulfillment. We implemented +the framework and evaluated a use case in which a CC system is responsible for +ensuring Quality of Service (QoS) and Quality of Experience (QoE) during video +streaming. Our results showed that edge devices required only ten training +rounds to ensure four SLOs; furthermore, the underlying causal structures were +also rationally explainable. The addition of new types of devices can be done a +posteriori; the framework allowed them to reuse existing models even though +the device type had been unknown. Finally, rebalancing the load within a device +cluster allowed individual edge devices to recover their SLO compliance after a +network failure from 22% to 89%. +
+
+
+
+
+ + ☆ Rescuing referral failures during automated diagnosis of domain-shifted + medical images + + +
+ The success of deep learning models deployed in the real world depends +critically on their ability to generalize well across diverse data domains. +Here, we address a fundamental challenge with selective classification during +automated diagnosis with domain-shifted medical images. In this scenario, +models must learn to avoid making predictions when label confidence is low, +especially when tested with samples far removed from the training set +(covariate shift). Such uncertain cases are typically referred to the clinician +for further analysis and evaluation. Yet, we show that even state-of-the-art +domain generalization approaches fail severely during referral when tested on +medical images acquired from a different demographic or using a different +technology. We examine two benchmark diagnostic medical imaging datasets +exhibiting strong covariate shifts: i) diabetic retinopathy prediction with +retinal fundus images and ii) multilabel disease prediction with chest X-ray +images. We show that predictive uncertainty estimates do not generalize well +under covariate shifts leading to non-monotonic referral curves, and severe +drops in performance (up to 50%) at high referral rates (>70%). We evaluate +novel combinations of robust generalization and post hoc referral approaches, +that rescue these failures and achieve significant performance improvements, +typically >10%, over baseline methods. Our study identifies a critical +challenge with referral in domain-shifted medical images and finds key +applications in reliable, automated disease diagnosis. + +
+
+
+
+
+ + ☆ Asynchronous Wireless Federated Learning with Probabilistic Client + Selection + + +
+ Federated learning (FL) is a promising distributed learning framework where +distributed clients collaboratively train a machine learning model coordinated +by a server. To tackle the stragglers issue in asynchronous FL, we consider +that each client keeps local updates and probabilistically transmits the local +model to the server at arbitrary times. We first derive the (approximate) +expression for the convergence rate based on the probabilistic client +selection. Then, an optimization problem is formulated to trade off the +convergence rate of asynchronous FL and mobile energy consumption by joint +probabilistic client selection and bandwidth allocation. We develop an +iterative algorithm to solve the non-convex problem globally optimally. +Experiments demonstrate the superiority of the proposed approach compared with +the traditional schemes. + +
+
+ comment: To appear in IEEE Transactions on Wireless Communications +
+
+
+
+
+ + ☆ Sluggish and Chemically-Biased Interstitial Diffusion in Concentrated + Solid Solution Alloys: Mechanisms and Methods + + +
+ Interstitial diffusion is a pivotal process that governs the phase stability +and irradiation response of materials in non-equilibrium conditions. In this +work, we study sluggish and chemically-biased interstitial diffusion in Fe-Ni +concentrated solid solution alloys (CSAs) by combining machine learning (ML) +and kinetic Monte Carlo (kMC), where ML is used to accurately and efficiently +predict the migration energy barriers on-the-fly. The ML-kMC reproduces the +diffusivity that was reported by molecular dynamics results at high +temperatures. With this powerful tool, we find that the observed sluggish +diffusion and the "Ni-Ni-Ni"-biased diffusion in Fe-Ni alloys are ascribed to a +unique "Barrier Lock" mechanism, whereas the "Fe-Fe-Fe"-biased diffusion is +influenced by a "Component Dominance" mechanism. Inspired by the mentioned +mechanisms, a practical AvgS-kMC method is proposed for conveniently and +swiftly determining interstitial-mediated diffusivity by only relying on the +mean energy barriers of migration patterns. Combining the AvgS-kMC with the +differential evolutionary algorithm, an inverse design strategy for optimizing +sluggish diffusion properties is applied to emphasize the crucial role of +favorable migration patterns. + +
+
+ comment: 30 pages,9 figures +
+
+
+
+
+ + ☆ LEDITS++: Limitless Image Editing using Text-to-Image Models + + +
+ Text-to-image diffusion models have recently received increasing interest for +their astonishing ability to produce high-fidelity images from solely text +inputs. Subsequent research efforts aim to exploit and apply their capabilities +to real image editing. However, existing image-to-image methods are often +inefficient, imprecise, and of limited versatility. They either require +time-consuming fine-tuning, deviate unnecessarily strongly from the input +image, and/or lack support for multiple, simultaneous edits. To address these +issues, we introduce LEDITS++, an efficient yet versatile and precise textual +image manipulation technique. LEDITS++'s novel inversion approach requires no +tuning nor optimization and produces high-fidelity results with a few diffusion +steps. Second, our methodology supports multiple simultaneous edits and is +architecture-agnostic. Third, we use a novel implicit masking technique that +limits changes to relevant image regions. We propose the novel TEdBench++ +benchmark as part of our exhaustive evaluation. Our results demonstrate the +capabilities of LEDITS++ and its improvements over previous methods. The +project page is available at https://leditsplusplus-project.static.hf.space . + +
+
+
+
+
+ + ☆ Sinkhorn Flow: A Continuous-Time Framework for Understanding and + Generalizing the Sinkhorn Algorithm + + +
+ Many problems in machine learning can be formulated as solving +entropy-regularized optimal transport on the space of probability measures. The +canonical approach involves the Sinkhorn iterates, renowned for their rich +mathematical properties. Recently, the Sinkhorn algorithm has been recast +within the mirror descent framework, thus benefiting from classical +optimization theory insights. Here, we build upon this result by introducing a +continuous-time analogue of the Sinkhorn algorithm. This perspective allows us +to derive novel variants of Sinkhorn schemes that are robust to noise and bias. +Moreover, our continuous-time dynamics not only generalize but also offer a +unified perspective on several recently discovered dynamics in machine learning +and mathematics, such as the "Wasserstein mirror flow" of (Deb et al. 2023) or +the "mean-field Schr\"odinger equation" of (Claisse et al. 2023). + +
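+
+ For context, the snippet below implements the classical discrete Sinkhorn iterates for entropy-regularized optimal
+ transport that the continuous-time framework above generalizes; it is the textbook algorithm, not the paper's flow.
+
+ import numpy as np
+
+ def sinkhorn(mu, nu, C, eps=0.1, iters=500):
+     """Entropy-regularized OT between histograms mu and nu with cost C:
+     alternately rescale rows and columns of the Gibbs kernel K = exp(-C/eps)
+     so the coupling P = diag(u) K diag(v) matches both marginals."""
+     K = np.exp(-C / eps)
+     u = np.ones_like(mu)
+     for _ in range(iters):
+         v = nu / (K.T @ u)
+         u = mu / (K @ v)
+     P = u[:, None] * K * v[None, :]
+     return P, np.sum(P * C)                         # coupling, transport cost
+
+ x = np.linspace(0, 1, 5)
+ C = (x[:, None] - x[None, :]) ** 2
+ mu = np.full(5, 0.2); nu = np.array([0.1, 0.1, 0.2, 0.3, 0.3])
+ P, cost = sinkhorn(mu, nu, C)
+ print(P.sum(axis=1), cost)
+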
+
+
+
+
+ + ☆ Rethinking Intermediate Layers design in Knowledge Distillation for + Kidney and Liver Tumor Segmentation + + +
+ Knowledge distillation (KD) has demonstrated remarkable success across various +domains, but its application to medical imaging tasks, such as kidney and liver +tumor segmentation, has encountered challenges. Many existing KD methods are +not specifically tailored for these tasks. Moreover, prevalent KD methods often +lack a careful consideration of what and from where to distill knowledge from +the teacher to the student. This oversight may lead to issues like the +accumulation of training bias within shallower student layers, potentially +compromising the effectiveness of KD. To address these challenges, we propose +Hierarchical Layer-selective Feedback Distillation (HLFD). HLFD strategically +distills knowledge from a combination of middle layers to earlier layers and +transfers final layer knowledge to intermediate layers at both the feature and +pixel levels. This design allows the model to learn higher-quality +representations from earlier layers, resulting in a robust and compact student +model. Extensive quantitative evaluations reveal that HLFD outperforms existing +methods by a significant margin. For example, in the kidney segmentation task, +HLFD surpasses the student model (without KD) by over 10pp, significantly +improving its focus on tumor-specific features. From a qualitative standpoint, +the student model trained using HLFD excels at suppressing irrelevant +information and can focus sharply on tumor-specific details, which opens a new +pathway for more efficient and accurate diagnostic tools. +
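+
+ As background, the PyTorch snippet below shows a generic feature-level distillation term in which a student feature map
+ is projected to the teacher's channel width, resized, and matched with an MSE loss; deciding which layers feed which
+ (the layer-selective part) is where HLFD differs and is not captured here.
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class FeatureDistillLoss(nn.Module):
+     """Generic feature distillation: project student features to the
+     teacher's channel dimension, then penalize the mean squared error."""
+     def __init__(self, c_student, c_teacher):
+         super().__init__()
+         self.proj = nn.Conv2d(c_student, c_teacher, kernel_size=1)
+
+     def forward(self, f_student, f_teacher):
+         f_s = self.proj(f_student)
+         if f_s.shape[-2:] != f_teacher.shape[-2:]:
+             f_s = F.interpolate(f_s, size=f_teacher.shape[-2:],
+                                 mode="bilinear", align_corners=False)
+         return F.mse_loss(f_s, f_teacher.detach())
+
+ loss_fn = FeatureDistillLoss(c_student=32, c_teacher=64)
+ print(loss_fn(torch.randn(2, 32, 16, 16), torch.randn(2, 64, 32, 32)).item())
+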
+
+ comment: Under-review at ISBI-2024 +
+
+
+
+
+ + ☆ Hyper-Relational Knowledge Graph Neural Network for Next POI + + +
+ With the advancement of mobile technology, Point of Interest (POI) +recommendation systems in Location-based Social Networks (LBSN) have brought +numerous benefits to both users and companies. Many existing works employ +Knowledge Graph (KG) to alleviate the data sparsity issue in LBSN. These +approaches primarily focus on modeling the pair-wise relations in LBSN to +enrich the semantics and thereby relieve the data sparsity issue. However, +existing approaches seldom consider the hyper-relations in LBSN, such as the +mobility relation (a 3-ary relation: user-POI-time). This makes it hard for the +model to exploit the semantics accurately. In addition, prior works overlook the rich +structural information inherent in KG, which consists of higher-order relations +and can further alleviate the impact of data sparsity. To this end, we propose a +Hyper-Relational Knowledge Graph Neural Network (HKGNN) model. In HKGNN, a +Hyper-Relational Knowledge Graph (HKG) that models the LBSN data is constructed +to maintain and exploit the rich semantics of hyper-relations. Then we propose +a Hypergraph Neural Network to utilize the structural information of HKG in a +cohesive way. In addition, a self-attention network is used to leverage +sequential information and make personalized recommendations. Furthermore, side +information, essential in reducing data sparsity by providing background +knowledge of POIs, is not fully utilized in current methods. In light of this, +we extended the current dataset with available side information to further +lessen the impact of data sparsity. Results of experiments on four real-world +LBSN datasets demonstrate the effectiveness of our approach compared to +existing state-of-the-art methods. +
+
+
+
+
+ + ☆ PyTorch Geometric High Order: A Unified Library for High Order Graph + Neural Network + + +
+ We introduce PyTorch Geometric High Order (PyGHO), a library for High Order +Graph Neural Networks (HOGNNs) that extends PyTorch Geometric (PyG). Unlike +ordinary Message Passing Neural Networks (MPNNs) that exchange messages between +nodes, HOGNNs, encompassing subgraph GNNs and k-WL GNNs, encode node tuples, a +method previously lacking a standardized framework and often requiring complex +coding. PyGHO's main objective is to provide a unified and user-friendly +interface for various HOGNNs. It accomplishes this through streamlined data +structures for node tuples, comprehensive data processing utilities, and a +flexible suite of operators for high-order GNN methodologies. In this work, we +present a detailed, in-depth overview of PyGHO and compare HOGNNs implemented with PyGHO +against their official implementations on real-world tasks. PyGHO achieves up to +$50\%$ acceleration and reduces the code needed for implementation by an order +of magnitude. Our library is available at +\url{https://github.com/GraphPKU/PygHO}. +
+
+
+
+
+ + ☆ MultiModal-Learning for Predicting Molecular Properties: A Framework + Based on Image and Graph Structures + + +
+ The quest for accurate prediction of drug molecule properties poses a +fundamental challenge in the realm of Artificial Intelligence Drug Discovery +(AIDD). An effective representation of drug molecules emerges as a pivotal +component in this pursuit. Contemporary leading-edge research predominantly +resorts to self-supervised learning (SSL) techniques to extract meaningful +structural representations from large-scale, unlabeled molecular data, +subsequently fine-tuning these representations for an array of downstream +tasks. However, an inherent shortcoming of these studies lies in their singular +reliance on one modality of molecular information, such as molecule image or +SMILES representations, thus neglecting the potential complementarity of +various molecular modalities. In response to this limitation, we propose MolIG, +a novel MultiModaL molecular pre-training framework for predicting molecular +properties based on Image and Graph structures. MolIG model innovatively +leverages the coherence and correlation between molecule graph and molecule +image to execute self-supervised tasks, effectively amalgamating the strengths +of both molecular representation forms. This holistic approach allows for the +capture of pivotal molecular structural characteristics and high-level semantic +information. Upon completion of pre-training, Graph Neural Network (GNN) +Encoder is used for the prediction of downstream tasks. In comparison to +advanced baseline models, MolIG exhibits enhanced performance in downstream +tasks pertaining to molecular property prediction within benchmark groups such +as MoleculeNet Benchmark Group and ADMET Benchmark Group. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Pseudo-Likelihood Inference NeurIPS 2023 + + +
+ Simulation-Based Inference (SBI) is a common name for an emerging family of +approaches that infer the model parameters when the likelihood is intractable. +Existing SBI methods either approximate the likelihood, such as Approximate +Bayesian Computation (ABC) or directly model the posterior, such as Sequential +Neural Posterior Estimation (SNPE). While ABC is efficient on low-dimensional +problems, on higher-dimensional tasks, it is generally outperformed by SNPE, +which leverages function approximation. In this paper, we propose +Pseudo-Likelihood Inference (PLI), a new method that brings neural +approximation into ABC, making it competitive on challenging Bayesian system +identification tasks. By utilizing integral probability metrics, we introduce a +smooth likelihood kernel with an adaptive bandwidth that is updated based on +information-theoretic trust regions. Thanks to this formulation, our method (i) +allows for optimizing neural posteriors via gradient descent, (ii) does not +rely on summary statistics, and (iii) enables multiple observations as input. +In comparison to SNPE, it leads to improved performance when more data is +available. The effectiveness of PLI is evaluated on four classical SBI +benchmark tasks and on a highly dynamic physical system, showing particular +advantages on stochastic simulations and multi-modal posterior landscapes. + +
+
+ comment: 27 pages, 12 figures, Published as a conference paper at NeurIPS 2023 +
+
+
+
+
+ + ☆ Elucidating Discrepancy in Explanations of Predictive Models Developed + using EMR + + +
+ The lack of transparency and explainability hinders the clinical adoption of +Machine learning (ML) algorithms. While explainable artificial intelligence +(XAI) methods have been proposed, little research has focused on the agreement +between these methods and expert clinical knowledge. This study applies current +state-of-the-art explainability methods to clinical decision support algorithms +developed for Electronic Medical Records (EMR) data to analyse the concordance +between these factors and discusses causes for identified discrepancies from a +clinical and technical perspective. Important factors for achieving trustworthy +XAI solutions for clinical decision support are also discussed. + +
+
+
+
+
+ + ☆ Rethinking Backdoor Attacks on Dataset Distillation: A Kernel Method + Perspective + + +
+ Dataset distillation offers a potential means to enhance data efficiency in +deep learning. Recent studies have shown its ability to counteract backdoor +risks present in original training samples. In this study, we delve into the +theoretical aspects of backdoor attacks and dataset distillation based on +kernel methods. We introduce two new theory-driven trigger pattern generation +methods specialized for dataset distillation. Following a comprehensive set of +analyses and experiments, we show that our optimization-based trigger design +framework informs effective backdoor attacks on dataset distillation. Notably, +datasets poisoned by our designed trigger prove resilient against conventional +backdoor attack detection and mitigation methods. Our empirical results +validate that the triggers developed using our approaches are proficient at +executing resilient backdoor attacks. + +
+
+ comment: 19 pages, 4 figures +
+
+
+
+
+ + ☆ Opening the Black Box: Towards inherently interpretable energy data + imputation models using building physics insight + + +
+ Missing data are frequently observed by practitioners and researchers in the +building energy modeling community. In this regard, advanced data-driven +solutions, such as Deep Learning methods, are typically required to reflect the +non-linear behavior of these anomalies. As an ongoing research question related +to Deep Learning, a model's applicability to limited data settings can be +explored by introducing prior knowledge in the network. This same strategy can +also lead to more interpretable predictions, hence facilitating the field +application of the approach. For that purpose, the aim of this paper is to +propose the use of Physics-informed Denoising Autoencoders (PI-DAE) for missing +data imputation in commercial buildings. In particular, the presented method +enforces physics-inspired soft constraints to the loss function of a Denoising +Autoencoder (DAE). In order to quantify the benefits of the physical component, +an ablation study between different DAE configurations is conducted. First, +three univariate DAEs are optimized separately on indoor air temperature, +heating, and cooling data. Then, two multivariate DAEs are derived from the +previous configurations. Eventually, a building thermal balance equation is +coupled to the last multivariate configuration to obtain PI-DAE. Additionally, +two commonly used benchmarks are employed to support the findings. It is shown +how introducing physical knowledge in a multivariate Denoising Autoencoder can +enhance the inherent model interpretability through the optimized physics-based +coefficients. While no significant improvement is observed in terms of +reconstruction error with the proposed PI-DAE, its enhanced robustness to +varying rates of missing data and the valuable insights derived from the +physics-based coefficients create opportunities for wider applications within +building systems and the built environment. + +
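+
+ A schematic sketch of the kind of physics-inspired soft constraint described above: a reconstruction loss is augmented
+ with the squared residual of a lumped thermal-balance relation linking indoor temperature change to heating and cooling
+ power. The balance equation, the coefficients a, b, C, and the column layout are illustrative placeholders, not the
+ PI-DAE formulation itself.
+
+ import torch
+
+ def pi_dae_loss(x_recon, x_clean, lam=0.1, dt=1.0):
+     """Denoising-autoencoder reconstruction loss plus a soft physics penalty.
+     Columns of x_* are assumed to be [indoor_temp, heating, cooling] per
+     time step, i.e. tensors of shape (batch, time, 3)."""
+     recon = torch.mean((x_recon - x_clean) ** 2)
+
+     temp, heat, cool = x_recon[..., 0], x_recon[..., 1], x_recon[..., 2]
+     # Lumped thermal balance: C * dT/dt ~ a * heating - b * cooling.
+     # In practice a, b, C would be learnable physics-based coefficients.
+     a, b, C = 1.0, 1.0, 5.0
+     dT = (temp[:, 1:] - temp[:, :-1]) / dt
+     residual = C * dT - (a * heat[:, :-1] - b * cool[:, :-1])
+     physics = torch.mean(residual ** 2)
+
+     return recon + lam * physics
+
+ x_clean = torch.randn(8, 24, 3)
+ x_recon = x_clean + 0.1 * torch.randn_like(x_clean)
+ print(pi_dae_loss(x_recon, x_clean).item())
+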
+
+ comment: Under review in Energy and Buildings +
+
+
+
+
+ + ☆ Outfit Completion via Conditional Set Transformation + + +
+ In this paper, we formulate the outfit completion problem as a set retrieval +task and propose a novel framework for solving this problem. The proposal +includes a conditional set transformation architecture with deep neural +networks and a compatibility-based regularization method. The proposed method +utilizes a map that is permutation-invariant with respect to the input set and +permutation-equivariant with respect to the condition set. This allows retrieving a set +that is compatible with the input set while reflecting the properties of the +condition set. In addition, since this structure outputs the elements of the +output set in a single inference pass, it can achieve a scalable inference speed +with respect to the cardinality of the output set. Experimental results on real +data reveal that the proposed method outperforms existing approaches in terms +of accuracy of the outfit completion task, condition satisfaction, and +compatibility of completion results. +
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ☆ Symmetry-regularized neural ordinary differential equations + + +
+ Neural Ordinary Differential Equations (Neural ODEs) are a class of deep +neural network models that interpret the hidden state dynamics of neural +networks as an ordinary differential equation, and are thereby capable of capturing +system dynamics in a continuous-time framework. In this work, I integrate +symmetry regularization into Neural ODEs. In particular, I use continuous Lie +symmetry of ODEs and PDEs associated with the model to derive conservation laws +and add them to the loss function, making it physics-informed. This +incorporation of inherent structural properties into the loss function could +significantly improve robustness and stability of the model during training. To +illustrate this method, I employ a toy model that utilizes a cosine rate of +change in the hidden state, showcasing the process of identifying Lie +symmetries, deriving conservation laws, and constructing a new loss function. +
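+
+ A compact sketch of the loss construction for the cosine toy model described above: the true dynamics are dh/dt = cos(t),
+ so I(t) = h(t) - sin(t) is conserved along exact trajectories, and its variation is added to the data loss as a
+ regularizer. The Euler rollout, network size, and weighting are simplifications for illustration, not the paper's setup.
+
+ import torch
+ import torch.nn as nn
+
+ class ODEFunc(nn.Module):
+     # Learns dh/dt as a function of (t, h); the true dynamics are dh/dt = cos(t).
+     def __init__(self):
+         super().__init__()
+         self.net = nn.Sequential(nn.Linear(2, 32), nn.Tanh(), nn.Linear(32, 1))
+     def forward(self, t, h):
+         return self.net(torch.cat([t, h], dim=-1))
+
+ def integrate(func, h0, ts):
+     """Simple explicit-Euler rollout (stand-in for a proper ODE solver)."""
+     hs, h = [h0], h0
+     for t0, t1 in zip(ts[:-1], ts[1:]):
+         h = h + (t1 - t0) * func(t0.expand_as(h), h)
+         hs.append(h)
+     return torch.stack(hs)
+
+ def symmetry_regularized_loss(pred, target, ts, lam=1.0):
+     data_loss = torch.mean((pred - target) ** 2)
+     invariant = pred.squeeze(-1) - torch.sin(ts).unsqueeze(-1)  # h(t) - sin(t)
+     conservation = torch.mean((invariant - invariant[0]) ** 2)
+     return data_loss + lam * conservation
+
+ ts = torch.linspace(0.0, 3.14, 50)
+ target = torch.sin(ts).reshape(-1, 1, 1)
+ pred = integrate(ODEFunc(), torch.zeros(1, 1), ts)
+ print(symmetry_regularized_loss(pred, target, ts).item())
+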
+
+
+
+
+ + ☆ Gaussian Processes for Monitoring Air-Quality in Kampala + + +
+ Monitoring air pollution is of vital importance to the overall health of the +population. Unfortunately, devices that can measure air quality can be +expensive, and many cities in low and middle-income countries have to rely on a +sparse allocation of them. In this paper, we investigate the use of Gaussian +Processes for both nowcasting the current air-pollution in places where there +are no sensors and forecasting the air-pollution in the future at the sensor +locations. In particular, we focus on the city of Kampala in Uganda, using data +from AirQo's network of sensors. We demonstrate the advantage of removing +outliers, compare different kernel functions and additional inputs. We also +compare two sparse approximations to allow for the large amounts of temporal +data in the dataset. + +
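+
+ As a deliberately simplified illustration of the nowcasting setting described above, the snippet fits a Gaussian process
+ to synthetic sensor readings at known coordinates and predicts pollution, with uncertainty, at an unmonitored location;
+ the AirQo data, kernel comparisons, and sparse approximations from the paper are not reproduced.
+
+ import numpy as np
+ from sklearn.gaussian_process import GaussianProcessRegressor
+ from sklearn.gaussian_process.kernels import RBF, WhiteKernel
+
+ # Synthetic stand-in for sensor data: (longitude, latitude) -> PM2.5 reading.
+ rng = np.random.default_rng(0)
+ coords = rng.uniform(low=[32.5, 0.25], high=[32.7, 0.40], size=(30, 2))
+ pm25 = 40 + 20 * np.sin(30 * coords[:, 0]) + rng.normal(scale=3, size=30)
+
+ kernel = 1.0 * RBF(length_scale=0.05) + WhiteKernel(noise_level=5.0)
+ gp = GaussianProcessRegressor(kernel=kernel, normalize_y=True).fit(coords, pm25)
+
+ # Nowcast at a location with no sensor, with predictive uncertainty.
+ mean, std = gp.predict(np.array([[32.62, 0.33]]), return_std=True)
+ print(f"predicted PM2.5: {mean[0]:.1f} +/- {std[0]:.1f}")
+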
+
+
+
+
+ + ☆ Beyond Labels: Advancing Cluster Analysis with the Entropy of Distance + Distribution (EDD) + + +
+ In the evolving landscape of data science, the accurate quantification of +clustering in high-dimensional data sets remains a significant challenge, +especially in the absence of predefined labels. This paper introduces a novel +approach, the Entropy of Distance Distribution (EDD), which represents a +paradigm shift in label-free clustering analysis. Traditional methods, reliant +on discrete labels, often struggle to discern intricate cluster patterns in +unlabeled data. EDD, however, leverages the characteristic differences in +pairwise point-to-point distances to discern clustering tendencies, independent +of data labeling. + Our method employs the Shannon information entropy to quantify the +'peakedness' or 'flatness' of distance distributions in a data set. This +entropy measure, normalized against its maximum value, effectively +distinguishes between strongly clustered data (indicated by pronounced peaks in +distance distribution) and more homogeneous, non-clustered data sets. This +label-free quantification is resilient against global translations and +permutations of data points, and with an additional dimension-wise z-scoring, +it becomes invariant to data set scaling. + We demonstrate the efficacy of EDD through a series of experiments involving +two-dimensional data spaces with Gaussian cluster centers. Our findings reveal +a monotonic increase in the EDD value with the widening of cluster widths, +moving from well-separated to overlapping clusters. This behavior underscores +the method's sensitivity and accuracy in detecting varying degrees of +clustering. EDD's potential extends beyond conventional clustering analysis, +offering a robust, scalable tool for unraveling complex data structures without +reliance on pre-assigned labels. + +
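+
+ A short sketch of the computation as described above: z-score each dimension, histogram all pairwise distances, take
+ the Shannon entropy of that distribution, and normalize by the maximum attainable entropy. The number of histogram
+ bins is an illustrative choice.
+
+ import numpy as np
+ from scipy.spatial.distance import pdist
+
+ def edd(X, bins=32):
+     """Entropy of Distance Distribution: lower values indicate a 'peakier'
+     pairwise-distance histogram, i.e. a stronger clustering tendency."""
+     Xz = (X - X.mean(axis=0)) / X.std(axis=0)       # dimension-wise z-scoring
+     d = pdist(Xz)                                   # all pairwise distances
+     p, _ = np.histogram(d, bins=bins)
+     p = p / p.sum()
+     p = p[p > 0]
+     return -(p * np.log(p)).sum() / np.log(bins)    # normalized entropy
+
+ rng = np.random.default_rng(0)
+ blob = np.concatenate([rng.normal(-5, 0.3, (200, 2)), rng.normal(5, 0.3, (200, 2))])
+ uniform = rng.uniform(-5, 5, (400, 2))
+ print("clustered:", round(edd(blob), 3), "uniform:", round(edd(uniform), 3))
+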
+
+
+
+
+ + ☆ On the Long Range Abilities of Transformers + + +
+ Despite their dominance in modern DL and, especially, NLP domains, +transformer architectures exhibit sub-optimal performance on long-range tasks +compared to recent layers that are specifically designed for this purpose. In +this work, drawing inspiration from key attributes of long-range layers, such +as state-space layers, linear RNN layers, and global convolution layers, we +demonstrate that minimal modifications to the transformer architecture can +significantly enhance performance on the Long Range Arena (LRA) benchmark, thus +narrowing the gap with these specialized layers. We identify that two key +principles for long-range tasks are (i) incorporating an inductive bias towards +smoothness, and (ii) locality. As we show, integrating these ideas into the +attention mechanism improves results with a negligible amount of additional +computation and without any additional trainable parameters. Our theory and +experiments also shed light on the reasons for the inferior performance of +transformers on long-range tasks and identify critical properties that are +essential for successfully capturing long-range dependencies. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ Adversarial Distribution Balancing for Counterfactual Reasoning + + +
+ The development of causal prediction models is challenged by the fact that +the outcome is only observable for the applied (factual) intervention and not +for its alternatives (the so-called counterfactuals); in medicine we only know +patients' survival for the administered drug and not for other therapeutic +options. Machine learning approaches for counterfactual reasoning have to deal +with both unobserved outcomes and distributional differences due to non-random +treatment administration. Unsupervised domain adaptation (UDA) addresses +similar issues; one has to deal with unobserved outcomes -- the labels of the +target domain -- and distributional differences between source and target +domain. We propose Adversarial Distribution Balancing for Counterfactual +Reasoning (ADBCR), which directly uses potential outcome estimates of the +counterfactuals to remove spurious causal relations. We show that ADBCR +outcompetes state-of-the-art methods on three benchmark datasets, and +demonstrate that ADBCR's performance can be further improved if unlabeled +validation data are included in the training procedure to better adapt the +model to the validation domain. + +
+
+ comment: Implementation available at https://github.com/sschrod/ADBCR +
+
+
+
+
+ + ☆ A Multivariate Unimodality Test Harnessing the Dip Statistic of + Mahalanobis Distances Over Random Projections + + +
+ Unimodality, pivotal in statistical analysis, offers insights into dataset +structures and drives sophisticated analytical procedures. While unimodality's +confirmation is straightforward for one-dimensional data using methods like +Silverman's approach and Hartigans' dip statistic, its generalization to higher +dimensions remains challenging. By extrapolating one-dimensional unimodality +principles to multi-dimensional spaces through linear random projections and +leveraging point-to-point distancing, our method, rooted in +$\alpha$-unimodality assumptions, presents a novel multivariate unimodality +test named mud-pod. Both theoretical and empirical studies confirm the efficacy +of our method in unimodality assessment of multidimensional datasets as well as +in estimating the number of clusters. + +
+
+ comment: 12 pages, 1 figure +
+
+
+
+
+ + ☆ Eigenmatrix for unstructured sparse recovery + + +
+ This paper considers the unstructured sparse recovery problems in a general +form. Examples include rational approximation, spectral function estimation, +Fourier inversion, Laplace inversion, and sparse deconvolution. The main +challenges are the noise in the sample values and the unstructured nature of +the sample locations. This paper proposes the eigenmatrix, a data-driven +construction with desired approximate eigenvalues and eigenvectors. The +eigenmatrix offers a new way for these sparse recovery problems. Numerical +results are provided to demonstrate the efficiency of the proposed method. + +
+
+
+
+
+ + ☆ LasTGL: An Industrial Framework for Large-Scale Temporal Graph Learning + + +
+ Over the past few years, graph neural networks (GNNs) have become powerful +and practical tools for learning on (static) graph-structured data. However, +many real-world applications, such as social networks and e-commerce, involve +temporal graphs where nodes and edges are dynamically evolving. Temporal graph +neural networks (TGNNs) have progressively emerged as an extension of GNNs to +address time-evolving graphs and have gradually become a trending research +topic in both academia and industry. Advancing research in such an emerging +field requires new tools to compose TGNN models and unify their different +schemes in dealing with temporal graphs. To facilitate research and application +in temporal graph learning, we introduce LasTGL, an industrial framework that +integrates unified and extensible implementations of common temporal graph +learning algorithms for various advanced tasks. The purpose of LasTGL is to +provide the essential building blocks for solving temporal graph learning +tasks, focusing on the guiding principles of user-friendliness and quick +prototyping on which PyTorch is based. In particular, LasTGL provides +comprehensive temporal graph datasets, TGNN models and utilities along with +well-documented tutorials, making it suitable for absolute beginners and +expert deep learning practitioners alike. + +
+
+ comment: Preprint; Work in progress +
+
+
+
+
+ + ☆ LC4SV: A Denoising Framework Learning to Compensate for Unseen Speaker + Verification Models + + +
+ The performance of speaker verification (SV) models may drop dramatically in +noisy environments. A speech enhancement (SE) module can be used as a front-end +strategy. However, existing SE methods may fail to bring performance +improvements to downstream SV systems due to artifacts in the predicted signals +of SE models. To compensate for artifacts, we propose a generic denoising +framework named LC4SV, which can serve as a pre-processor for various unknown +downstream SV models. In LC4SV, we employ a learning-based interpolation agent +to automatically generate the appropriate coefficients between the enhanced +signal and its noisy input to improve SV performance in noisy environments. Our +experimental results demonstrate that LC4SV consistently improves the +performance of various unseen SV systems. To the best of our knowledge, this +work is the first attempt to develop a learning-based interpolation scheme +aiming at improving SV performance in noisy environments. + +
+
+
+
+
+ + ☆ GSP-KalmanNet: Tracking Graph Signals via Neural-Aided Kalman Filtering + + +
+ Dynamic systems of graph signals are encountered in various applications, +including social networks, power grids, and transportation. While such systems +can often be described as state space (SS) models, tracking graph signals via +conventional tools based on the Kalman filter (KF) and its variants is +typically challenging. This is due to the nonlinearity, high dimensionality, +irregularity of the domain, and complex modeling associated with real-world +dynamic systems of graph signals. In this work, we study the tracking of graph +signals using a hybrid model-based/data-driven approach. We develop the +GSP-KalmanNet, which tracks the hidden graphical states from the graphical +measurements by jointly leveraging graph signal processing (GSP) tools and deep +learning (DL) techniques. The derivations of the GSP-KalmanNet are based on +extending the KF to exploit the inherent graph structure via graph frequency +domain filtering, which considerably simplifies the computational complexity +entailed in processing high-dimensional signals and increases the robustness to +small topology changes. Then, we use data to learn the Kalman gain following +the recently proposed KalmanNet framework, which copes with partial and +approximated modeling, without forcing a specific model over the noise +statistics. Our empirical results demonstrate that the proposed GSP-KalmanNet +achieves enhanced accuracy and run time performance as well as improved +robustness to model misspecifications compared with both model-based and +data-driven benchmarks. + +
+
+ comment: Submitted for possible publication in the IEEE +
+
+
+
+
+ + ☆ D4AM: A General Denoising Framework for Downstream Acoustic Models + + +
+ The performance of acoustic models degrades notably in noisy environments. +Speech enhancement (SE) can be used as a front-end strategy to aid automatic +speech recognition (ASR) systems. However, existing training objectives of SE +methods are not fully effective at integrating speech-text and noisy-clean +paired data for training toward unseen ASR systems. In this study, we propose a +general denoising framework, D4AM, for various downstream acoustic models. Our +framework fine-tunes the SE model with the backward gradient according to a +specific acoustic model and the corresponding classification objective. In +addition, our method aims to consider the regression objective as an auxiliary +loss to make the SE model generalize to other unseen acoustic models. To +jointly train an SE unit with regression and classification objectives, D4AM +uses an adjustment scheme to directly estimate suitable weighting coefficients +rather than undergoing a grid search process with additional training costs. +The adjustment scheme consists of two parts: gradient calibration and +regression objective weighting. The experimental results show that D4AM can +consistently and effectively provide improvements to various unseen acoustic +models and outperforms other combination setups. Specifically, when evaluated +on the Google ASR API with real noisy data completely unseen during SE +training, D4AM achieves a relative WER reduction of 24.65% compared with the +direct feeding of noisy input. To our knowledge, this is the first work that +deploys an effective combination scheme of regression (denoising) and +classification (ASR) objectives to derive a general pre-processor applicable to +various unseen ASR systems. Our code is available at +https://github.com/ChangLee0903/D4AM. + +
+
+
+
+
+ + ☆ Empowering COVID-19 Detection: Optimizing Performance Through Fine-Tuned + EfficientNet Deep Learning Architecture + + +
+ The worldwide COVID-19 pandemic has profoundly influenced the health and +everyday experiences of individuals across the planet. It is a highly +contagious respiratory disease requiring early and accurate detection to curb +its rapid transmission. Initial testing methods primarily revolved around +identifying the genetic composition of the coronavirus, exhibiting a relatively +low detection rate and requiring a time-intensive procedure. To address this +challenge, experts have suggested using radiological imagery, particularly +chest X-rays, as a valuable approach within the diagnostic protocol. This study +investigates the potential of leveraging radiographic imaging (X-rays) with +deep learning algorithms to swiftly and precisely identify COVID-19 patients. +The proposed approach elevates the detection accuracy by fine-tuning with +appropriate layers on various established transfer learning models. The +experimentation was conducted on a COVID-19 X-ray dataset containing 2000 +images. The fine-tuned EfficientNetB4 model achieved an impressive accuracy of +100%, showcasing its potential as a robust COVID-19 detection model. Furthermore, +EfficientNetB4 excelled in identifying lung disease using a chest X-ray dataset +containing 4,350 images, achieving remarkable performance with an accuracy of +99.17%, precision of 99.13%, recall of 99.16%, and F1-score of 99.14%. These +results highlight the promise of fine-tuned transfer learning for efficient +lung disease detection through medical imaging, especially with X-ray images. This +research offers radiologists an effective means of aiding rapid and precise +COVID-19 diagnosis and provides valuable assistance to healthcare +professionals in accurately identifying affected patients. + +
+
+ comment: Computers in Biology and Medicine [Q1, IF: 7.7, CS: 9.2] +
+
+
+
+
+ + ☆ Improving Lane Detection Generalization: A Novel Framework using HD Maps + for Boosting Diversity + + +
+ Lane detection is a vital task for vehicles to navigate and localize their +position on the road. To ensure reliable results, lane detection algorithms +must have robust generalization performance in various road environments. +However, despite the significant performance improvement of deep learning-based +lane detection algorithms, their generalization performance in response to +changes in road environments still falls short of expectations. In this paper, +we present a novel framework for single-source domain generalization (SSDG) in +lane detection. By decomposing data into lane structures and surroundings, we +enhance diversity using High-Definition (HD) maps and generative models. Rather +than expanding data volume, we strategically select a core subset of data, +maximizing diversity and optimizing performance. Our extensive experiments +demonstrate that our framework enhances the generalization performance of lane +detection, comparable to the domain adaptation-based method. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ☆ FedAL: Black-Box Federated Knowledge Distillation Enabled by Adversarial + Learning + + +
+ Knowledge distillation (KD) can enable collaborative learning among +distributed clients that have different model architectures and do not share +their local data and model parameters with others. Each client updates its +local model using the average model output/feature of all client models as the +target, known as federated KD. However, existing federated KD methods often do +not perform well when clients' local models are trained with heterogeneous +local datasets. In this paper, we propose Federated knowledge distillation +enabled by Adversarial Learning (FedAL) to address the data heterogeneity among +clients. First, to alleviate the local model output divergence across clients +caused by data heterogeneity, the server acts as a discriminator to guide +clients' local model training to achieve consensus model outputs among clients +through a min-max game between clients and the discriminator. Moreover, +catastrophic forgetting may happen during the clients' local training and +global knowledge transfer due to clients' heterogeneous local data. Towards +this challenge, we design the less-forgetting regularization for both local +training and global knowledge transfer to guarantee clients' ability to +transfer/learn knowledge to/from others. Experimental results show that FedAL +and its variants achieve higher accuracy than other federated KD baselines. + +
+
+
+
+
+ + ☆ Scalable Label Distribution Learning for Multi-Label Classification + + +
+ Multi-label classification (MLC) refers to the problem of tagging a given +instance with a set of relevant labels. Most existing MLC methods are based on +the assumption that the correlation of two labels in each label pair is +symmetric, which is violated in many real-world scenarios. Moreover, most +existing methods design learning processes associated with the number of +labels, which makes their computational complexity a bottleneck when scaling up +to large-scale output spaces. To tackle these issues, we propose a novel MLC +method named Scalable Label Distribution Learning (SLDL), which describes different labels as distributions +in a latent space, where the label correlation is asymmetric and the dimension +is independent of the number of labels. Specifically, SLDL first converts +labels into continuous distributions within a low-dimensional latent space and +leverages an asymmetric metric to establish the correlation between different +labels. Then, it learns the mapping from the feature space to the latent space, +so that the computational complexity is no longer tied to the number of +labels. Finally, SLDL leverages a nearest-neighbor-based strategy to decode the +latent representations and obtain the final predictions. Our extensive +experiments illustrate that SLDL achieves very competitive classification +performance with little computational cost. + +
+
+
+
+
+ + ☆ Exploring Straighter Trajectories of Flow Matching with Diffusion + Guidance + + +
+ Flow matching, as a generative modeling paradigm, achieves notable success +across various domains. However, existing methods use either multi-round +training or knowledge within minibatches, posing challenges in finding a +favorable coupling strategy for straight trajectories. To address this issue, +we propose a novel approach, Straighter trajectories of Flow Matching +(StraightFM). It straightens trajectories with a coupling strategy guided by +a diffusion model at the level of the entire distribution. First, we propose a coupling +strategy to straighten trajectories, creating couplings between image and noise +samples under diffusion model guidance. Second, StraightFM also integrates real +data to enhance training, employing a neural network to parameterize another +coupling process from images to noise samples. StraightFM is jointly optimized +with couplings from the above two mutually complementary directions, resulting in +straighter trajectories and enabling both one-step and few-step generation. +Extensive experiments demonstrate that StraightFM yields high-quality samples +with fewer steps. StraightFM generates visually appealing images with the lowest +FID among diffusion and traditional flow matching methods within 5 sampling +steps when trained in pixel space. In the latent space (i.e., Latent +Diffusion), StraightFM achieves a lower KID value compared to existing methods +on the CelebA-HQ 256 dataset in fewer than 10 sampling steps. + +
+
+
+
+
+ + ☆ Communication Efficiency Optimization of Federated Learning for + Computing and Network Convergence of 6G Networks + + +
+ Federated learning effectively addresses issues such as data privacy by +collaborating across participating devices to train global models. However, +factors such as network topology and device computing power can affect its +training or communication process in complex network environments. Computing and +network convergence (CNC) of 6G networks, a new network architecture and paradigm +with computing-measurable, perceptible, distributable, dispatchable, and manageable +capabilities, can effectively support federated learning +training and improve its communication efficiency. CNC can reach this goal by guiding the participating +devices' training in federated learning based on business requirements, +resource load, network conditions, and the computing power of devices. In this paper, to improve the communication efficiency of +federated learning in complex networks, we study the communication efficiency +optimization of federated learning for computing and network convergence of 6G +networks, proposing methods that make decisions on the training process for different +network conditions and computing power of the participating devices in federated +learning. The experiments address two architectures that exist for devices in +federated learning and arrange devices to participate in training based on +computing power while optimizing communication efficiency in +the process of transferring model parameters. The results show that the proposed method +can (1) cope well with complex network situations, (2) effectively +balance the delay distribution of participating devices for local training, (3) +improve the communication efficiency during the transfer of model parameters, and +(4) improve the resource utilization in the network. + +
+
+ comment: 13 pages, 11 figures, accepted by Frontiers of Information Technology + & Electronic Engineering +
+
+
+
+
+ + ☆ Federated Learning with Diffusion Models for Privacy-Sensitive Vision + Tasks + + +
+ Diffusion models have shown great potential for vision-related tasks, +particularly for image generation. However, their training is typically +conducted in a centralized manner, relying on data collected from publicly +available sources. This approach may not be feasible or practical in many +domains, such as the medical field, which involves privacy concerns over data +collection. Despite the challenges associated with privacy-sensitive data, such +domains could still benefit from valuable vision services provided by diffusion +models. Federated learning (FL) plays a crucial role in enabling decentralized +model training without compromising data privacy. Instead of collecting data, +an FL system gathers model parameters, effectively safeguarding the private +data of different parties involved. This makes FL systems vital for managing +decentralized learning tasks, especially in scenarios where privacy-sensitive +data is distributed across a network of clients. Nonetheless, FL presents its +own set of challenges due to its distributed nature and privacy-preserving +properties. Therefore, in this study, we explore the FL strategy to train +diffusion models, paving the way for the development of federated diffusion +models. We conduct experiments on various FL scenarios, and our findings +demonstrate that federated diffusion models have great potential to deliver +vision services to privacy-sensitive domains. + +
+
+
+
+
+ + ☆ Personalized Predictions of Glioblastoma Infiltration: Mathematical + Models, Physics-Informed Neural Networks and Multimodal Scans + + +
+ Predicting the infiltration of Glioblastoma (GBM) from medical MRI scans is +crucial for understanding tumor growth dynamics and designing personalized +radiotherapy treatment plans. Mathematical models of GBM growth can complement +the data in the prediction of spatial distributions of tumor cells. However, +this requires estimating patient-specific parameters of the model from clinical +data, which is a challenging inverse problem due to limited temporal data and +the limited time between imaging and diagnosis. This work proposes a method +that uses Physics-Informed Neural Networks (PINNs) to estimate patient-specific +parameters of a reaction-diffusion PDE model of GBM growth from a single 3D +structural MRI snapshot. PINNs embed both the data and the PDE into a loss +function, thus integrating theory and data. Key innovations include the +identification and estimation of characteristic non-dimensional parameters, a +pre-training step that utilizes the non-dimensional parameters, and a +fine-tuning step to determine the patient-specific parameters. Additionally, +the diffuse domain method is employed to handle the complex brain geometry +within the PINN framework. Our method is validated on both synthetic and +patient datasets and shows promise for real-time parametric inference in the +clinical setting for personalized GBM treatment. + +
+
+
+
+
+ + ☆ Contrastive encoder pre-training-based clustered federated learning for + heterogeneous data + + +
+ Federated learning (FL) is a promising approach that enables distributed +clients to collaboratively train a global model while preserving their data +privacy. However, FL often suffers from data heterogeneity problems, which can +significantly affect its performance. To address this, clustered federated +learning (CFL) has been proposed to construct personalized models for different +client clusters. One effective client clustering strategy is to allow clients +to choose their own local models from a model pool based on their performance. +However, without pre-trained model parameters, such a strategy is prone to +clustering failure, in which all clients choose the same model. Unfortunately, +collecting a large amount of labeled data for pre-training can be costly and +impractical in distributed environments. To overcome this challenge, we +leverage self-supervised contrastive learning to exploit unlabeled data for the +pre-training of FL systems. Together, self-supervised pre-training and client +clustering can be crucial components for tackling the data heterogeneity issues +of FL. Leveraging these two crucial strategies, we propose contrastive +pre-training-based clustered federated learning (CP-CFL) to improve the model +convergence and overall performance of FL systems. In this work, we demonstrate +the effectiveness of CP-CFL through extensive experiments in heterogeneous FL +settings, and present various interesting observations. + +
+
+ comment: Published in Neural Networks +
+
+
+
+
+ + ☆ Utility Fairness in Contextual Dynamic Pricing with Demand Learning + + +
+ This paper introduces a novel contextual bandit algorithm for personalized +pricing under utility fairness constraints in scenarios with uncertain demand, +achieving an optimal regret upper bound. Our approach, which incorporates +dynamic pricing and demand learning, addresses the critical challenge of +fairness in pricing strategies. We first delve into the static full-information +setting to formulate an optimal pricing policy as a constrained optimization +problem. Here, we propose an approximation algorithm for efficiently computing the ideal policy. + We also use mathematical analysis and computational studies to characterize +the structures of optimal contextual pricing policies subject to fairness +constraints, deriving simplified policies that lay the foundation for more +in-depth research and extensions. + Further, we extend our study to dynamic pricing problems with demand +learning, establishing a non-standard regret lower bound that highlights the +complexity added by fairness constraints. Our research offers a comprehensive +analysis of the cost of fairness and its impact on the balance between utility +and revenue maximization. This work represents a step towards integrating +ethical considerations into algorithmic efficiency in data-driven dynamic +pricing. + +
+
+
+
+
+ + ☆ On robust overfitting: adversarial training induced distribution matters + + +
+ Adversarial training may be regarded as standard training with a modified +loss function. However, its generalization error appears much larger than that of standard +training under the standard loss. This phenomenon, known as robust overfitting, has +attracted significant research attention and remains largely a mystery. In +this paper, we first show empirically that robust overfitting correlates with +the increasing generalization difficulty of the perturbation-induced +distributions along the trajectory of adversarial training (specifically +PGD-based adversarial training). We then provide a novel upper bound for +generalization error with respect to the perturbation-induced distributions, in +which a notion of the perturbation operator, referred to as "local dispersion", +plays an important role. + +
+
+
+
+
+ + ☆ 3D Teeth Reconstruction from Panoramic Radiographs using Neural Implicit + Functions MICCAI 2023 + + +
+ Panoramic radiography is a widely used imaging modality in dental practice +and research. However, it only provides flattened 2D images, which limits the +detailed assessment of dental structures. In this paper, we propose Occudent, a +framework for 3D teeth reconstruction from panoramic radiographs using neural +implicit functions, which, to the best of our knowledge, is the first work to +do so. For a given point in 3D space, the implicit function estimates whether +the point is occupied by a tooth, and thus implicitly determines the boundaries +of 3D tooth shapes. Firstly, Occudent applies multi-label segmentation to the +input panoramic radiograph. Next, tooth shape embeddings as well as tooth class +embeddings are generated from the segmentation outputs, which are fed to the +reconstruction network. A novel module called Conditional eXcitation (CX) is +proposed in order to effectively incorporate the combined shape and class +embeddings into the implicit function. The performance of Occudent is evaluated +using both quantitative and qualitative measures. Importantly, Occudent is +trained and validated with actual panoramic radiographs as input, distinct from +recent works which used synthesized images. Experiments demonstrate the +superiority of Occudent over state-of-the-art methods. + +
+
+ comment: 12 pages, 2 figures, accepted to International Conference on Medical + Image Computing and Computer-Assisted Intervention MICCAI 2023 +
+
+
+
+
+ + ☆ Evaluation of dynamic characteristics of power grid based on GNN and + application on knowledge graph + + +
+ A novel method for detecting faults in power grids using a graph neural +network (GNN) has been developed, aimed at enhancing intelligent fault +diagnosis in network operation and maintenance. This GNN-based approach +identifies faulty nodes within the power grid through a specialized electrical +feature extraction model coupled with a knowledge graph. Incorporating temporal +data, the method leverages the status of nodes from preceding and subsequent +time periods to aid in current fault detection. To validate the effectiveness +of this GNN in extracting node features, a correlation analysis of the output +features from each node within the neural network layer was conducted. The +results from experiments show that this method can accurately locate fault +nodes in simulated scenarios with a remarkable 99.53% accuracy. Additionally, +the graph neural network's feature modeling allows for a qualitative +examination of how faults spread across nodes, providing valuable insights for +analyzing fault nodes. + +
+
+
+
+
+ + ☆ Value Approximation for Two-Player General-Sum Differential Games with + State Constraints + + +
+ Solving Hamilton-Jacobi-Isaacs (HJI) PDEs enables equilibrial feedback +control in two-player differential games, yet faces the curse of dimensionality +(CoD). While physics-informed machine learning has been adopted to address CoD +in solving PDEs, this method falls short in learning discontinuous solutions +due to its sampling nature, leading to poor safety performance of the resulting +controllers in robotics applications where values are discontinuous due to +state or other temporal logic constraints. In this study, we explore three +potential solutions to this problem: (1) a hybrid learning method that uses +both equilibrium demonstrations and the HJI PDE, (2) a value-hardening method +where a sequence of HJI PDEs is solved with an increasing Lipschitz constant on the +constraint violation penalty, and (3) the epigraphical technique that lifts the +value to a higher-dimensional auxiliary state space where the value becomes +continuous. Evaluations through 5D and 9D vehicle simulations and 13D drone +simulations reveal that the hybrid method outperforms others in terms of +generalization and safety performance. + +
+
+ comment: Submitted to TRO +
+
+
+
+
+ + ☆ B-LSTM-MIONet: Bayesian LSTM-based Neural Operators for Learning the + Response of Complex Dynamical Systems to Length-Variant Multiple Input + Functions + + +
+ Deep Operator Network (DeepONet) is a neural network framework for learning +nonlinear operators such as those from ordinary differential equations (ODEs) +describing complex systems. Multiple-input deep neural operators (MIONet) +extended DeepONet to allow multiple input functions in different Banach spaces. +MIONet offers flexibility in training dataset grid spacing, without constraints +on output location. However, it requires offline inputs and cannot handle +varying sequence lengths in testing datasets, limiting its real-time +application in dynamic complex systems. This work redesigns MIONet, integrating +Long Short-Term Memory (LSTM) to learn neural operators from time-dependent +data. This approach overcomes data discretization constraints and harnesses +LSTM's capability with variable-length, real-time data. Factors affecting +learning performance, such as algorithm extrapolation ability, are presented. The +framework is enhanced with uncertainty quantification through a novel Bayesian +method, sampling from MIONet parameter distributions. Consequently, we develop +the B-LSTM-MIONet, incorporating LSTM's temporal strengths with Bayesian +robustness, resulting in a more precise and reliable model for noisy datasets. + +
+
+
+
+
+ + ☆ StyleCap: Automatic Speaking-Style Captioning from Speech Based on + Speech and Language Self-supervised Learning Models ICASSP 2024 + + +
+ We propose StyleCap, a method to generate natural language descriptions of +speaking styles appearing in speech. Although most of conventional techniques +for para-/non-linguistic information recognition focus on the category +classification or the intensity estimation of pre-defined labels, they cannot +provide the reasoning of the recognition result in an interpretable manner. As +a first step towards an end-to-end method for generating speaking-style prompts +from speech, i.e., automatic speaking-style captioning, StyleCap uses paired +data of speech and natural language descriptions to train neural networks that +predict prefix vectors fed into a large language model (LLM)-based text decoder +from a speech representation vector. We explore an appropriate text decoder and +speech feature representation suitable for this new task. The experimental +results demonstrate that our StyleCap leveraging richer LLMs for the text +decoder, speech self-supervised learning (SSL) features, and sentence +rephrasing augmentation improves the accuracy and diversity of generated +speaking-style captions. Samples of speaking-style captions generated by our +StyleCap are publicly available. + +
+
+ comment: Submitted to ICASSP 2024 +
+
+
+
+
+ + ☆ On the Robustness of Decision-Focused Learning AAAI + + +
+ Decision-Focused Learning (DFL) is an emerging learning paradigm that tackles +the task of training a machine learning (ML) model to predict the missing +parameters of an incomplete optimization problem. DFL trains an ML model in an end-to-end system by integrating +the prediction and optimization tasks, providing better alignment of the +training and testing objectives. DFL has shown great promise and holds the +capacity to revolutionize decision-making in many real-world applications. +However, very little is known about the performance of these models under +adversarial attacks. We adopt ten unique DFL methods and benchmark their +performance under two distinctly focused attacks adapted towards the +Predict-then-Optimize problem setting. Our study proposes the hypothesis that +the robustness of a model is highly correlated with its ability to find +predictions that lead to optimal decisions without deviating from the +ground-truth label. Furthermore, we provide insight into how to target the +models that violate this condition and show how these models respond +differently depending on the achieved optimality at the end of their training +cycles. + +
+
+ comment: 17 pages, 45 figures, submitted to AAAI artificial intelligence for + operations research workshop +
+
+
+
+
+ + ☆ On the Effect of Defections in Federated Learning and How to Prevent + Them + + +
+ Federated learning is a machine learning protocol that enables a large +population of agents to collaborate over multiple rounds to produce a single +consensus model. There are several federated learning applications where agents +may choose to defect permanently, essentially withdrawing from the +collaboration, if they are content with their instantaneous model in that +round. This work demonstrates the detrimental impact of such defections on the +final model's robustness and ability to generalize. We also show that current +federated optimization algorithms fail to disincentivize these harmful +defections. We introduce a novel optimization algorithm with theoretical +guarantees to prevent defections while ensuring asymptotic convergence to an +effective solution for all participating agents. We also provide numerical +experiments to corroborate our findings and demonstrate the effectiveness of +our algorithm. + +
+
+
+
+
+ + ☆ Enabling Fast 2-bit LLM on GPUs: Memory Alignment, Sparse Outlier, and + Asynchronous Dequantization + + +
+ Large language models (LLMs) have demonstrated impressive abilities in +various domains, but their inference cost is expensive. The state-of-the-art +methods use 2-bit quantization for mainstream LLMs. However, challenges still +exist: (1) Nonnegligible accuracy loss for 2-bit quantization. Weights are +quantized by groups, while the ranges of weights are large in some groups, +resulting in large quantization errors and nonnegligible accuracy loss (e.g., +>3% for Llama2-7b with 2-bit quantization in GPTQ and Greenbit). (2) Limited +accuracy improvement by adding 4-bit weights. Increasing the average bit-width by 10% through additional +4-bit weights only leads to <0.5% accuracy improvement on a quantized +Llama2-7b. (3) Time-consuming dequantization operations on GPUs. The +dequantization operations account for >50% of execution time, hindering the potential +of reducing LLM inference cost. To tackle these challenges, we propose the +following techniques: (1) We only quantize a small fraction of the groups with the +larger range using 4 bits, with memory alignment taken into consideration on GPUs. (2) We +point out that the distribution of the sparse outliers with larger weights is +different in 2-bit and 4-bit groups, and only a small fraction of outliers +require 16-bit quantization. This design leads to >0.5% accuracy improvement +with a <3% increase in average bits for Llama2-7b. (3) We design asynchronous +dequantization on GPUs, leading to up to 3.92X speedup. We conduct extensive +experiments on different model families and model sizes. We achieve 2.85 bits +per weight, and the end-to-end speedup for Llama2-7b is 1.74X over the +original model; we reduce both runtime cost and hardware cost by up to +2.70X and 2.81X with lower GPU requirements. + +
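Technique (1), keeping only the widest-range weight groups at 4 bits and the rest at 2 bits, can be illustrated with a toy NumPy sketch. The group size, the 10% fraction, and the uniform asymmetric quantizer below are illustrative assumptions and do not reflect the paper's GPU kernels or memory-alignment scheme.

```python
import numpy as np

def quantize_group(w, bits):
    """Uniform asymmetric quantization of one weight group to `bits` bits,
    returned in dequantized form so the error can be measured directly."""
    qmax = 2 ** bits - 1
    lo, hi = w.min(), w.max()
    scale = (hi - lo) / qmax if hi > lo else 1.0
    q = np.clip(np.round((w - lo) / scale), 0, qmax)
    return q * scale + lo

def mixed_precision_quantize(weights, group_size=64, frac_4bit=0.1):
    """Quantize most groups to 2 bits; the widest-range fraction gets 4 bits.
    Assumes the weight count is divisible by `group_size`."""
    groups = weights.reshape(-1, group_size)
    ranges = groups.max(axis=1) - groups.min(axis=1)
    n_4bit = max(1, int(frac_4bit * len(groups)))
    wide = set(np.argsort(ranges)[-n_4bit:].tolist())  # widest-range groups
    out = np.empty_like(groups)
    for i, g in enumerate(groups):
        out[i] = quantize_group(g, 4 if i in wide else 2)
    return out.reshape(weights.shape)

w = np.random.default_rng(0).normal(size=(4096,)).astype(np.float32)
w_q = mixed_precision_quantize(w)
print("mean absolute quantization error:", np.abs(w - w_q).mean())
```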
+
+
+
+
+ + ☆ Text-Driven Image Editing via Learnable Regions + + +
+ Language has emerged as a natural interface for image editing. In this paper, +we introduce a method for region-based image editing driven by textual prompts, +without the need for user-provided masks or sketches. Specifically, our +approach leverages an existing pretrained text-to-image model and introduces a +bounding box generator to find the edit regions that are aligned with the +textual prompts. We show that this simple approach enables flexible editing +that is compatible with current image generation models, and is able to handle +complex prompts featuring multiple objects, complex sentences or long +paragraphs. We conduct an extensive user study to compare our method against +state-of-the-art methods. Experiments demonstrate the competitive performance +of our method in manipulating images with high fidelity and realism that align +with the language descriptions provided. Our project webpage: +https://yuanze-lin.me/LearnableRegions_page. + +
+
+ comment: Project webpage: https://yuanze-lin.me/LearnableRegions_page +
+
+
+
+
+ + ☆ Manifold Preserving Guided Diffusion + + +
+ Despite the recent advancements, conditional image generation still faces +challenges of cost, generalizability, and the need for task-specific training. +In this paper, we propose Manifold Preserving Guided Diffusion (MPGD), a +training-free conditional generation framework that leverages pretrained +diffusion models and off-the-shelf neural networks with minimal additional +inference cost for a broad range of tasks. Specifically, we leverage the +manifold hypothesis to refine the guided diffusion steps and introduce a +shortcut algorithm in the process. We then propose two methods for on-manifold +training-free guidance using pre-trained autoencoders and demonstrate that our +shortcut inherently preserves the manifolds when applied to latent diffusion +models. Our experiments show that MPGD is efficient and effective for solving a +variety of conditional generation applications in low-compute settings, and can +consistently offer up to 3.8x speed-ups with the same number of diffusion steps +while maintaining high sample quality compared to the baselines. + +
+
+
+
+
+ + ☆ Model-free Test Time Adaptation for Out-Of-Distribution Detection + + +
+ Out-of-distribution (OOD) detection is essential for the reliability of ML +models. Most existing methods for OOD detection learn a fixed decision +criterion from a given in-distribution dataset and apply it universally to +decide if a data point is OOD. Recent work (Fang et al., 2022) shows that given +only in-distribution data, it is impossible to reliably detect OOD data without +extra assumptions. Motivated by this theoretical result and recent exploration +of test-time adaptation methods, we propose a Non-Parametric Test Time +Adaptation framework for Out-Of-Distribution Detection (AdaODD). Unlike +conventional methods, AdaODD utilizes online test samples for model adaptation +during testing, enhancing adaptability to changing data distributions. The +framework incorporates detected OOD instances into decision-making, reducing +false positive rates, particularly when ID and OOD distributions overlap +significantly. We demonstrate the effectiveness of AdaODD through comprehensive +experiments on multiple OOD detection benchmarks; extensive empirical studies +show that AdaODD significantly improves the performance of OOD detection over +state-of-the-art methods. Specifically, AdaODD reduces the false positive rate +(FPR95) by $23.23\%$ on the CIFAR-10 benchmarks and $38\%$ on the ImageNet-1k +benchmarks compared to advanced methods. Lastly, we theoretically verify the +effectiveness of AdaODD. + +
+
+ comment: 12 pages, 10 figures +
+
+
+
+
+ + ☆ A Combinatorial Approach to Robust PCA + + +
+ We study the problem of recovering Gaussian data under adversarial +corruptions when the noises are low-rank and the corruptions are on the +coordinate level. Concretely, we assume that the Gaussian noises lie in an +unknown $k$-dimensional subspace $U \subseteq \mathbb{R}^d$, and $s$ randomly +chosen coordinates of each data point fall into the control of an adversary. +This setting models the scenario of learning from high-dimensional yet +structured data that are transmitted through a highly-noisy channel, so that +the data points are unlikely to be entirely clean. + Our main result is an efficient algorithm that, when $ks^2 = O(d)$, recovers +every single data point up to a nearly-optimal $\ell_1$ error of $\tilde +O(ks/d)$ in expectation. At the core of our proof is a new analysis of the +well-known Basis Pursuit (BP) method for recovering a sparse signal, which is +known to succeed under additional assumptions (e.g., incoherence or the +restricted isometry property) on the underlying subspace $U$. In contrast, we +present a novel approach via studying a natural combinatorial problem and show +that, over the randomness in the support of the sparse signal, a +high-probability error bound is possible even if the subspace $U$ is arbitrary. + +
+
+ comment: To appear at ITCS 2024 +
+
+
+
+
+ + ☆ Deep Learning for Time Series Classification of Parkinson's Disease Eye + Tracking Data ML4H + + +
+ Eye-tracking is an accessible and non-invasive technology that provides +information about a subject's motor and cognitive abilities. As such, it has +proven to be a valuable resource in the study of neurodegenerative diseases +such as Parkinson's disease. Saccade experiments, in particular, have proven +useful in the diagnosis and staging of Parkinson's disease. However, to date, +no single eye-movement biomarker has been found to conclusively differentiate +patients from healthy controls. In the present work, we investigate the use of +state-of-the-art deep learning algorithms to perform Parkinson's disease +classification using eye-tracking data from saccade experiments. In contrast to +previous work, instead of using hand-crafted features from the saccades, we use +raw $\sim1.5\,s$ long fixation intervals recorded during the preparatory phase +before each trial. Using these short time series as input we implement two +different classification models, InceptionTime and ROCKET. We find that the +models are able to learn the classification task and generalize to unseen +subjects. InceptionTime achieves $78\%$ accuracy, while ROCKET achieves $88\%$ +accuracy. We also employ a novel method for pruning the ROCKET model to improve +interpretability and generalizability, achieving an accuracy of $96\%$. Our +results suggest that fixation data has low inter-subject variability and +potentially carries useful information about brain cognitive and motor +conditions, making it suitable for use with machine learning in the discovery +of disease-relevant biomarkers. + +
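A ROCKET-style classifier of the kind used above can be approximated with random convolutional kernels, proportion-of-positive-values features, and a ridge classifier. The sketch below is a simplified stand-in, not the authors' pipeline or their pruning procedure; the kernel settings and array names are assumptions.

```python
import numpy as np
from sklearn.linear_model import RidgeClassifierCV

def rocket_features(series, n_kernels=500, seed=0):
    """Minimal ROCKET-style features: random dilated kernels, PPV + max per kernel.

    `series` has shape (n_samples, n_timesteps), e.g. ~1.5 s fixation traces.
    The same seed must be used for train and test so the kernels match.
    """
    rng = np.random.default_rng(seed)
    n, t = series.shape
    feats = np.zeros((n, 2 * n_kernels))
    for k in range(n_kernels):
        length = rng.choice([7, 9, 11])
        weights = rng.normal(size=length)
        weights -= weights.mean()
        bias = rng.uniform(-1, 1)
        dilation = 2 ** rng.integers(0, max(1, int(np.log2((t - 1) / (length - 1)))))
        idx = np.arange(length) * dilation
        valid = t - idx[-1]
        if valid <= 0:
            continue
        # convolve every series with the dilated kernel (explicit sliding windows)
        windows = np.stack([series[:, i + idx] for i in range(valid)], axis=1)
        conv = windows @ weights + bias            # shape (n, valid)
        feats[:, 2 * k] = (conv > 0).mean(axis=1)  # proportion of positive values
        feats[:, 2 * k + 1] = conv.max(axis=1)
    return feats

# Hypothetical usage with fixation intervals X_train/X_test and labels y_train/y_test:
# clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
# clf.fit(rocket_features(X_train), y_train)
# print(clf.score(rocket_features(X_test), y_test))
```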
+
+ comment: Extended Abstract presented at Machine Learning for Health (ML4H) + symposium 2023, December 10th, 2023, New Orleans, United States, 12 pages +
+
+
+
+
+ + ☆ LiveTune: Dynamic Parameter Tuning for Training Deep Neural Networks + + +
+ Traditional machine learning training is a static process that lacks +real-time adaptability of hyperparameters. Popular tuning solutions during +runtime involve checkpoints and schedulers. Adjusting hyperparameters usually +requires the program to be restarted, wasting utilization and time, while +placing unnecessary strain on memory and processors. We present LiveTune, a new +framework allowing real-time parameter tuning during training through +LiveVariables. LiveVariables allow for a continuous training session by +storing parameters on designated ports on the system, allowing them to be +dynamically adjusted. Extensive evaluations of our framework show savings of up to +60 seconds and 5.4 kilojoules of energy per hyperparameter change. + +
+
+
+
+
+ + ☆ An Online Optimization-Based Decision Support Tool for Small Farmers in + India: Learning in Non-stationary Environments + + +
+ Crop management decision support systems are specialized tools for farmers +that reduce the riskiness of revenue streams, especially valuable for use under +the current climate changes that impact agricultural productivity. +Unfortunately, small farmers in India, who could greatly benefit from these +tools, do not have access to them. In this paper, we model an individual +greenhouse as a Markov Decision Process (MDP) and adapt Li and Li (2019)'s +Follow the Weighted Leader (FWL) online learning algorithm to offer crop +planning advice. We successfully produce utility-preserving cropping pattern +suggestions in simulations. When we compare against an offline planning +algorithm, we achieve the same cumulative revenue with greatly reduced runtime. + +
+
+
+
+
+ + ☆ SoUnD Framework: Analyzing (So)cial Representation in (Un)structured + (D)ata + + +
+ The unstructured nature of data used in foundation model development is a +challenge to systematic analyses for making data use and documentation +decisions. From a Responsible AI perspective, these decisions often rely upon +understanding how people are represented in data. We propose a framework +designed to guide analysis of human representation in unstructured data and +identify downstream risks. We apply the framework in two toy examples using the +Common Crawl web text corpus (C4) and LAION-400M. We also propose a set of +hypothetical action steps in service of dataset use, development, and +documentation. + +
+
+
+
+
+ + ☆ Fourier Neural Differential Equations for learning Quantum Field + Theories + + +
+ A Quantum Field Theory is defined by its interaction Hamiltonian, and linked +to experimental data by the scattering matrix. The scattering matrix is +calculated as a perturbative series, and represented succinctly as a first +order differential equation in time. Neural Differential Equations (NDEs) learn +the time derivative of a residual network's hidden state, and have proven +efficacy in learning differential equations with physical constraints. Hence +using an NDE to learn particle scattering matrices presents a possible +experiment-theory phenomenological connection. In this paper, NDE models are +used to learn $\phi^4$ theory, Scalar-Yukawa theory and Scalar Quantum +Electrodynamics. A new NDE architecture is also introduced, the Fourier Neural +Differential Equation (FNDE), which combines NDE integration and Fourier +network convolution. The FNDE model demonstrates better generalisability than +the non-integrated equivalent FNO model. It is also shown that by training on +scattering data, the interaction Hamiltonian of a theory can be extracted from +network parameters. + +
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ Quantifying the redundancy between prosody and text EMNLP + + +
+ Prosody -- the suprasegmental component of speech, including pitch, loudness, +and tempo -- carries critical aspects of meaning. However, the relationship +between the information conveyed by prosody vs. by the words themselves remains +poorly understood. We use large language models (LLMs) to estimate how much +information is redundant between prosody and the words themselves. Using a +large spoken corpus of English audiobooks, we extract prosodic features aligned +to individual words and test how well they can be predicted from LLM +embeddings, compared to non-contextual word embeddings. We find a high degree +of redundancy between the information carried by the words and prosodic +information across several prosodic features, including intensity, duration, +pauses, and pitch contours. Furthermore, a word's prosodic information is +redundant with both the word itself and the context preceding as well as +following it. Still, we observe that prosodic features can not be fully +predicted from text, suggesting that prosody carries information above and +beyond the words. Along with this paper, we release a general-purpose data +processing pipeline for quantifying the relationship between linguistic +information and extra-linguistic features. + +
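The core comparison, how well a prosodic feature can be predicted from contextual versus non-contextual word embeddings, can be sketched with a ridge regressor and held-out R^2. The arrays and file names below are hypothetical placeholders; this is not the authors' released pipeline.

```python
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import cross_val_score

def predictability(embeddings, prosody):
    """Held-out R^2 of a ridge regressor predicting a per-word prosodic
    feature (e.g. duration or mean pitch) from word embeddings."""
    model = RidgeCV(alphas=np.logspace(-2, 4, 13))
    return cross_val_score(model, embeddings, prosody, cv=5, scoring="r2").mean()

# Hypothetical arrays, one row per word token:
# contextual = np.load("llm_embeddings.npy")      # e.g. contextual LLM hidden states
# static = np.load("static_word_embeddings.npy")  # e.g. non-contextual vectors
# duration = np.load("word_durations.npy")
# print("contextual R2:", predictability(contextual, duration))
# print("static R2:", predictability(static, duration))
```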
+
+ comment: Published at The 2023 Conference on Empirical Methods in Natural + Language Processing (EMNLP) +
+
+
+
+
+ + ☆ Invariance assumptions for class distribution estimation + + +
+ We study the problem of class distribution estimation under dataset shift. On +the training dataset, both features and class labels are observed while on the +test dataset only the features can be observed. The task then is the estimation +of the distribution of the class labels, i.e. the estimation of the class prior +probabilities, in the test dataset. Assumptions of invariance between the +training joint distribution of features and labels and the test distribution +can considerably facilitate this task. We discuss the assumptions of covariate +shift, factorizable joint shift, and sparse joint shift and their implications +for class distribution estimation. + +
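As one concrete instance, the covariate shift assumption (invariant p(y|x)) admits a particularly simple class prior estimator: average the calibrated training posteriors over the unlabeled test features. The sketch below covers only this case, not the factorizable or sparse joint shift assumptions discussed above, and the classifier choice is an illustrative assumption.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

def class_priors_under_covariate_shift(clf, x_test):
    """Estimate test-set class priors assuming covariate shift.

    Under covariate shift p(y | x) is invariant, so the test prior p_test(y)
    can be estimated by averaging the (ideally well-calibrated) training
    posteriors over the unlabeled test features.
    """
    return clf.predict_proba(x_test).mean(axis=0)

# Hypothetical usage with labeled training data and unlabeled test features:
# clf = LogisticRegression(max_iter=1000).fit(x_train, y_train)
# print(class_priors_under_covariate_shift(clf, x_test))
```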
+
+ comment: 16 pages, presented at workshop Learning to Quantify: Methods and + Applications (LQ 2023), Torino, September 18, 2023 +
+
+
+
+
+ + ☆ BIM: Block-Wise Self-Supervised Learning with Masked Image Modeling + + +
+ Like masked language modeling (MLM) in natural language processing, masked +image modeling (MIM) aims to extract valuable insights from image patches to +enhance the feature extraction capabilities of the underlying deep neural +network (DNN). In contrast to other training paradigms such as supervised +learning and unsupervised contrastive learning, MIM +pretraining typically demands significant computational resources in order to +manage large training data batches (e.g., 4096). The significant memory and +computation requirements pose a considerable challenge to its broad adoption. +To mitigate this, we introduce a novel learning framework, +termed Block-Wise Masked Image Modeling (BIM). This framework involves +decomposing the MIM task into several sub-tasks with independent computation +patterns, resulting in block-wise back-propagation operations instead of the +traditional end-to-end approach. Our proposed BIM maintains superior +performance compared to conventional MIM while greatly reducing peak memory +consumption. Moreover, BIM naturally enables the concurrent training of +numerous DNN backbones of varying depths. This leads to the creation of +multiple trained DNN backbones, each tailored to different hardware platforms +with distinct computing capabilities. This approach significantly reduces +computational costs in comparison with training each DNN backbone individually. +Our framework offers a promising solution for resource-constrained training of +MIM. + +
+
+
+
+
+ + ☆ Optimal EEG Electrode Set for Emotion Recognition From Brain Signals: An + Empirical Quest + + +
+ The human brain is a complex organ, still not fully understood, that +controls almost every part of the body. Beyond sustaining survival, the human brain +also produces emotions. Recent research indicates that brain signals can be very +effective for emotion recognition. However, which parts of the brain contribute +most to emotion remains under-explored. In this study, we empirically +analyze the contribution of each part of the brain in exhibiting emotions. We +use the DEAP dataset to find the optimal electrode set, which in turn points to +the brain region most associated with emotions. We use the Fast Fourier +Transform (FFT) for feature extraction and a 1D-CNN with a residual +connection for classification. While the full set of 32 electrodes from the DEAP dataset achieves +an accuracy of 97.34%, a subset of only 12 electrodes (F7, P8, O1, F8, C4, T7, PO3, Fp1, +Fp2, O2, P3, and Fz) achieves 95.81% accuracy. This study also shows that adding +more than 10 electrodes does not improve performance significantly. Moreover, +the frontal lobe is found to be the most important region for recognizing emotion. + +
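The FFT-based feature extraction step can be sketched as per-channel band-power computation; DEAP signals are sampled at 128 Hz. The band definitions and function names below are common conventions used for illustration, not necessarily the exact features or the 1D-CNN configuration used in the study.

```python
import numpy as np

BANDS = {"theta": (4, 8), "alpha": (8, 13), "beta": (13, 30), "gamma": (30, 45)}

def band_powers(eeg, fs=128.0):
    """FFT band-power features per channel.

    `eeg` has shape (n_channels, n_samples); DEAP recordings are sampled at
    128 Hz. Returns an array of shape (n_channels, n_bands).
    """
    freqs = np.fft.rfftfreq(eeg.shape[-1], d=1.0 / fs)
    psd = np.abs(np.fft.rfft(eeg, axis=-1)) ** 2
    feats = []
    for lo, hi in BANDS.values():
        mask = (freqs >= lo) & (freqs < hi)
        feats.append(psd[:, mask].mean(axis=-1))
    return np.stack(feats, axis=-1)

# Restricting to the 12-electrode subset reported above could look like:
# eeg_subset = eeg[indices_of_12_electrodes, :]   # hypothetical index array
# features = band_powers(eeg_subset)
```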
+
+
+
+
+ + ☆ Minimax Exploiter: A Data Efficient Approach for Competitive Self-Play + + +
+ Recent advances in Competitive Self-Play (CSP) have achieved, or even +surpassed, human level performance in complex game environments such as Dota 2 +and StarCraft II using Distributed Multi-Agent Reinforcement Learning (MARL). +One core component of these methods relies on creating a pool of learning +agents -- consisting of the Main Agent, past versions of this agent, and +Exploiter Agents -- where Exploiter Agents learn counter-strategies to the Main +Agents. A key drawback of these approaches is the large computational cost and +physical time that is required to train the system, making them impractical to +deploy in highly iterative real-life settings such as video game productions. +In this paper, we propose the Minimax Exploiter, a game theoretic approach to +exploiting Main Agents that leverages knowledge of its opponents, leading to +significant increases in data efficiency. We validate our approach in a +diversity of settings, including simple turn based games, the arcade learning +environment, and For Honor, a modern video game. The Minimax Exploiter +consistently outperforms strong baselines, demonstrating improved stability and +data efficiency, leading to a robust CSP-MARL method that is both flexible and +easy to deploy. + +
+
+
+
+
+ + ☆ SatCLIP: Global, General-Purpose Location Embeddings with Satellite + Imagery + + +
+ Geographic location is essential for modeling tasks in fields ranging from +ecology to epidemiology to the Earth system sciences. However, extracting +relevant and meaningful characteristics of a location can be challenging, often +entailing expensive data fusion or data distillation from global imagery +datasets. To address this challenge, we introduce Satellite Contrastive +Location-Image Pretraining (SatCLIP), a global, general-purpose geographic +location encoder that learns an implicit representation of locations from +openly available satellite imagery. Trained location encoders provide vector +embeddings summarizing the characteristics of any given location for convenient +usage in diverse downstream tasks. We show that SatCLIP embeddings, pretrained +on globally sampled multi-spectral Sentinel-2 satellite data, can be used in +various predictive tasks that depend on location information but not +necessarily satellite imagery, including temperature prediction, animal +recognition in imagery, and population density estimation. Across tasks, +SatCLIP embeddings consistently outperform embeddings from existing pretrained +location encoders, ranging from models trained on natural images to models +trained on semantic context. SatCLIP embeddings also help to improve geographic +generalization. This demonstrates the potential of general-purpose location +encoders and opens the door to learning meaningful representations of our +planet from the vast, varied, and largely untapped modalities of geospatial +data. + +
+
+
+
+
+ + ☆ A personalized Uncertainty Quantification framework for patient survival + models: estimating individual uncertainty of patients with metastatic brain + tumors in the absence of ground truth + + +
+ To develop a novel Uncertainty Quantification (UQ) framework to estimate the +uncertainty of patient survival models in the absence of ground truth, we +developed and evaluated our approach based on a dataset of 1383 patients +treated with stereotactic radiosurgery (SRS) for brain metastases between +January 2015 and December 2020. Our motivating hypothesis is that a +time-to-event prediction of a test patient on inference is more certain given a +higher feature-space similarity to patients in the training set. Therefore, the +uncertainty for a particular patient-of-interest is represented by the +concordance index between a patient similarity rank and a prediction similarity +rank. Model uncertainty was defined as the increased percentage of the max +uncertainty-constrained-AUC compared to the model AUC. We evaluated our method +on multiple clinically-relevant endpoints, including time to intracranial +progression (ICP), progression-free survival (PFS) after SRS, overall survival +(OS), and time to ICP and/or death (ICPD), on a variety of both statistical and +non-statistical models, including CoxPH, conditional survival forest (CSF), and +neural multi-task linear regression (NMTLR). Our results show that all models +had the lowest uncertainty on ICP (2.21%) and the highest uncertainty (17.28%) +on ICPD. OS models demonstrated high variation in uncertainty performance, +where NMTLR had the lowest uncertainty (1.96%) and CSF had the highest +uncertainty (14.29%). In conclusion, our method can estimate the uncertainty of +individual patient survival modeling results. As expected, our data empirically +demonstrate that as model uncertainty measured via our technique increases, the +similarity between a feature-space and its predicted outcome decreases. + +
+
+
+
+
+ + ♻ ☆ Generative Social Choice + + +
+ Traditionally, social choice theory has only been applicable to choices among +a few predetermined alternatives but not to more complex decisions such as +collectively selecting a textual statement. We introduce generative social +choice, a framework that combines the mathematical rigor of social choice +theory with the capability of large language models to generate text and +extrapolate preferences. This framework divides the design of AI-augmented +democratic processes into two components: first, proving that the process +satisfies rigorous representation guarantees when given access to oracle +queries; second, empirically validating that these queries can be approximately +implemented using a large language model. We apply this framework to the +problem of generating a slate of statements that is representative of opinions +expressed as free-form text; specifically, we develop a democratic process with +representation guarantees and use this process to represent the opinions of +participants in a survey about chatbot personalization. We find that 93 out of +100 participants feel "mostly" or "perfectly" represented by the slate of five +statements we extracted. + +
+
+ comment: Substantially revised with non-approval utility model, new + representation axiom (balanced justified representation), and real-world case + study +
+
+
+
+
+ + ♻ ☆ Forward Gradients for Data-Driven CFD Wall Modeling + + +
+ Computational Fluid Dynamics (CFD) is used in the design and optimization of
+gas turbines and many other industrial/scientific applications. However, the
+practical use is often limited by the high computational cost, and the accurate
+resolution of near-wall flow is a significant contributor to this cost. Machine
+learning (ML) and other data-driven methods can complement existing wall
+models. Nevertheless, training these models is bottlenecked by the large
+computational effort and memory footprint demanded by back-propagation. Recent
+work has presented alternatives for computing gradients of neural networks
+where a separate forward and backward sweep is not needed and storage of
+intermediate results between sweeps is not required because an unbiased
+estimator for the gradient is computed in a single forward sweep. In this
+paper, we discuss the application of this approach for training a subgrid wall
+model that could potentially be used as a surrogate in wall-bounded flow CFD
+simulations to reduce the computational overhead while preserving predictive
+accuracy.
+
+
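+ A toy numerical illustration of the estimator referenced above: for a random tangent direction
+v ~ N(0, I), the forward gradient (grad(f) . v) v is an unbiased estimate of grad(f) and needs only
+one directional derivative per sample. The quadratic loss and the analytic directional derivative
+below are assumptions; in practice the directional derivative comes from forward-mode AD in a
+single sweep.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+A = rng.normal(size=(5, 5)); A = A @ A.T            # SPD matrix, f(theta) = 0.5 theta^T A theta
+theta = rng.normal(size=5)
+true_grad = A @ theta                               # exact gradient, kept for comparison
+
+estimates = []
+for _ in range(20000):
+    v = rng.normal(size=5)                          # random tangent direction
+    directional = true_grad @ v                     # what forward-mode AD would return in one sweep
+    estimates.append(directional * v)               # forward-gradient estimate
+
+print(np.abs(np.mean(estimates, axis=0) - true_grad).max())  # close to 0: unbiased on average
+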
+
+
+
+
+ + ♻ ☆ Edge Directionality Improves Learning on Heterophilic Graphs + + +
+ Graph Neural Networks (GNNs) have become the de-facto standard tool for +modeling relational data. However, while many real-world graphs are directed, +the majority of today's GNN models discard this information altogether by +simply making the graph undirected. The reasons for this are historical: 1) +many early variants of spectral GNNs explicitly required undirected graphs, and +2) the first benchmarks on homophilic graphs did not find significant gain from +using direction. In this paper, we show that in heterophilic settings, treating +the graph as directed increases the effective homophily of the graph, +suggesting a potential gain from the correct use of directionality information. +To this end, we introduce Directed Graph Neural Network (Dir-GNN), a novel +general framework for deep learning on directed graphs. Dir-GNN can be used to +extend any Message Passing Neural Network (MPNN) to account for edge +directionality information by performing separate aggregations of the incoming +and outgoing edges. We prove that Dir-GNN matches the expressivity of the +Directed Weisfeiler-Lehman test, exceeding that of conventional MPNNs. In +extensive experiments, we validate that while our framework leaves performance +unchanged on homophilic datasets, it leads to large gains over base models such +as GCN, GAT and GraphSage on heterophilic benchmarks, outperforming much more +complex methods and achieving new state-of-the-art results. + +
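+ A rough sketch of one directed message-passing layer in the spirit described above: incoming and
+outgoing edges are aggregated separately with their own weight matrices. The mean aggregation,
+ReLU update, and toy sizes are simplifying assumptions rather than the exact Dir-GNN formulation.
+
+import numpy as np
+
+def directed_mp_layer(X, A, W_self, W_in, W_out):
+    # A[i, j] = 1 means a directed edge i -> j
+    deg_out = np.maximum(A.sum(axis=1, keepdims=True), 1)
+    deg_in = np.maximum(A.sum(axis=0, keepdims=True).T, 1)
+    agg_out = (A @ X) / deg_out        # mean over out-neighbours
+    agg_in = (A.T @ X) / deg_in        # mean over in-neighbours
+    return np.maximum(X @ W_self + agg_in @ W_in + agg_out @ W_out, 0.0)
+
+rng = np.random.default_rng(0)
+n, d = 6, 4
+A = (rng.random((n, n)) < 0.3).astype(float)
+np.fill_diagonal(A, 0.0)
+X = rng.normal(size=(n, d))
+W_self, W_in, W_out = (rng.normal(size=(d, d)) for _ in range(3))
+print(directed_mp_layer(X, A, W_self, W_in, W_out).shape)
+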
+
+
+
+
+ + ♻ ☆ H-Packer: Holographic Rotationally Equivariant Convolutional Neural + Network for Protein Side-Chain Packing + + +
+ Accurately modeling protein 3D structure is essential for the design of +functional proteins. An important sub-task of structure modeling is protein +side-chain packing: predicting the conformation of side-chains (rotamers) given +the protein's backbone structure and amino-acid sequence. Conventional +approaches for this task rely on expensive sampling procedures over +hand-crafted energy functions and rotamer libraries. Recently, several deep +learning methods have been developed to tackle the problem in a data-driven +way, albeit with vastly different formulations (from image-to-image translation +to directly predicting atomic coordinates). Here, we frame the problem as a +joint regression over the side-chains' true degrees of freedom: the dihedral +$\chi$ angles. We carefully study possible objective functions for this task, +while accounting for the underlying symmetries of the task. We propose +Holographic Packer (H-Packer), a novel two-stage algorithm for side-chain +packing built on top of two light-weight rotationally equivariant neural +networks. We evaluate our method on CASP13 and CASP14 targets. H-Packer is +computationally efficient and shows favorable performance against conventional +physics-based algorithms and is competitive against alternative deep learning +solutions. + +
+
+ comment: Accepted as a conference paper at MLCB 2023. 8 pages main body, 20 + pages with appendix. 10 figures +
+
+
+
+
+ + ♻ ☆ Adaptive Bayesian Learning with Action and State-Dependent Signal + Variance + + +
+ This manuscript presents an advanced framework for Bayesian learning by +incorporating action and state-dependent signal variances into decision-making +models. This framework is pivotal in understanding complex data-feedback loops +and decision-making processes in various economic systems. Through a series of +examples, we demonstrate the versatility of this approach in different +contexts, ranging from simple Bayesian updating in stable environments to +complex models involving social learning and state-dependent uncertainties. The +paper uniquely contributes to the understanding of the nuanced interplay +between data, actions, outcomes, and the inherent uncertainty in economic +models. + +
+
+
+
+
+ + ♻ ☆ Towards Responsible Governance of Biological Design Tools NeurIPS 2023 + + +
+ Recent advancements in generative machine learning have enabled rapid +progress in biological design tools (BDTs) such as protein structure and +sequence prediction models. The unprecedented predictive accuracy and novel +design capabilities of BDTs present new and significant dual-use risks. For +example, their predictive accuracy allows biological agents, whether vaccines +or pathogens, to be developed more quickly, while the design capabilities could +be used to discover drugs or evade DNA screening techniques. Similar to other +dual-use AI systems, BDTs present a wicked problem: how can regulators uphold +public safety without stifling innovation? We highlight how current regulatory +proposals that are primarily tailored toward large language models may be less +effective for BDTs, which require fewer computational resources to train and +are often developed in an open-source manner. We propose a range of measures to +mitigate the risk that BDTs are misused, across the areas of responsible +development, risk assessment, transparency, access management, cybersecurity, +and investing in resilience. Implementing such measures will require close +coordination between developers and governments. + +
+
+ comment: 10 pages + references, 1 figure, accepted at NeurIPS 2023 Workshop on + Regulatable ML as oral presentation +
+
+
+
+
+ + ♻ ☆ Diffusion Models for Interferometric Satellite Aperture Radar + + +
+ Probabilistic Diffusion Models (PDMs) have recently emerged as a very
+promising class of generative models, achieving high performance in natural
+image generation. However, their performance relative to non-natural images,
+like radar-based satellite data, remains largely unknown. Generating large
+amounts of synthetic (and especially labelled) satellite data is crucial to
+implement deep-learning approaches for the processing and analysis of
+(interferometric) satellite aperture radar data. Here, we leverage PDMs to
+generate several radar-based satellite image datasets. We show that PDMs
+succeed in generating images with complex and realistic structures, but that
+sampling time remains an issue. Indeed, accelerated sampling strategies, which
+work well on simple image datasets like MNIST, fail on our radar datasets. We
+provide a simple and versatile open-source codebase at
+https://github.com/thomaskerdreux/PDM_SAR_InSAR_generation to train, sample and
+evaluate PDMs using any dataset on a single GPU.
+
+
+
+
+
+
+ + ♻ ☆ T-Rep: Representation Learning for Time Series using Time-Embeddings ICLR 2024 + + +
+ Multivariate time series present challenges to standard machine learning +techniques, as they are often unlabeled, high dimensional, noisy, and contain +missing data. To address this, we propose T-Rep, a self-supervised method to +learn time series representations at a timestep granularity. T-Rep learns +vector embeddings of time alongside its feature extractor, to extract temporal +features such as trend, periodicity, or distribution shifts from the signal. +These time-embeddings are leveraged in pretext tasks, to incorporate smooth and +fine-grained temporal dependencies in the representations, as well as reinforce +robustness to missing data. We evaluate T-Rep on downstream classification, +forecasting, and anomaly detection tasks. It is compared to existing +self-supervised algorithms for time series, which it outperforms in all three +tasks. We test T-Rep in missing data regimes, where it proves more resilient +than its counterparts. Finally, we provide latent space visualisation +experiments, highlighting the interpretability of the learned representations. + +
+
+ comment: Under review at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Antenna Response Consistency Driven Self-supervised Learning for + WIFI-based Human Activity Recognition + + +
+ Self-supervised learning (SSL) for WiFi-based human activity recognition +(HAR) holds great promise due to its ability to address the challenge of +insufficient labeled data. However, directly transplanting SSL algorithms, +especially contrastive learning, originally designed for other domains to CSI +data, often fails to achieve the expected performance. We attribute this issue +to the inappropriate alignment criteria, which disrupt the semantic distance +consistency between the feature space and the input space. To address this +challenge, we introduce \textbf{A}ntenna \textbf{R}esponse \textbf{C}onsistency +(ARC) as a solution to define proper alignment criteria. ARC is designed to +retain semantic information from the input space while introducing robustness +to real-world noise. Moreover, we substantiate the effectiveness of ARC through +a comprehensive set of experiments, demonstrating its capability to enhance the +performance of self-supervised learning for WiFi-based HAR by achieving an +increase of over 5\% in accuracy in most cases and achieving a best accuracy of +94.97\%. + +
+
+
+
+
+ + ♻ ☆ Fantastic Generalization Measures are Nowhere to be Found + + +
+ We study the notion of a generalization bound being uniformly tight, meaning +that the difference between the bound and the population loss is small for all +learning algorithms and all population distributions. Numerous generalization +bounds have been proposed in the literature as potential explanations for the +ability of neural networks to generalize in the overparameterized setting. +However, in their paper ``Fantastic Generalization Measures and Where to Find +Them,'' Jiang et al. (2020) examine more than a dozen generalization bounds, +and show empirically that none of them are uniformly tight. This raises the +question of whether uniformly-tight generalization bounds are at all possible +in the overparameterized setting. We consider two types of generalization +bounds: (1) bounds that may depend on the training set and the learned +hypothesis (e.g., margin bounds). We prove mathematically that no such bound +can be uniformly tight in the overparameterized setting; (2) bounds that may in +addition also depend on the learning algorithm (e.g., stability bounds). For +these bounds, we show a trade-off between the algorithm's performance and the +bound's tightness. Namely, if the algorithm achieves good accuracy on certain +distributions, then no generalization bound can be uniformly tight for it in +the overparameterized setting. We explain how these formal results can, in our +view, inform research on generalization bounds for neural networks, while +stressing that other interpretations of these results are also possible. + +
+
+ comment: 34 pages, 1 figure. Minor fix: subsection 6.2 -> section 7 +
+
+
+
+
+ + ♻ ☆ Towards Attributions of Input Variables in a Coalition + + +
+ This paper aims to develop a new attribution method to explain the conflict
+between individual variables' attributions and their coalition's attribution
+from a fully new perspective. First, we find that the Shapley value can be
+reformulated as the allocation of Harsanyi interactions encoded by the AI
+model. Second, based on the re-allocation of interactions, we extend the
+Shapley value to the attribution of coalitions. Third, we derive the
+fundamental mechanism behind the conflict: it arises from interactions that
+contain only part of the variables in the coalition.
+
+
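+ For context, a tiny illustrative computation of the Shapley values whose re-allocation into
+Harsanyi interactions the abstract refers to; the toy set function and the exhaustive enumeration
+are assumptions for illustration, and the coalition attribution extension itself is not reproduced.
+
+from itertools import combinations
+from math import factorial
+
+players = [0, 1, 2]
+
+def value(subset):
+    # toy set function with an extra interaction effect between players 0 and 1
+    s = set(subset)
+    return len(s) + (2.0 if {0, 1} <= s else 0.0)
+
+def shapley(i):
+    others = [p for p in players if p != i]
+    n, total = len(players), 0.0
+    for k in range(len(others) + 1):
+        for S in combinations(others, k):
+            weight = factorial(k) * factorial(n - k - 1) / factorial(n)
+            total += weight * (value(S + (i,)) - value(S))
+    return total
+
+print([round(shapley(i), 3) for i in players])  # players 0 and 1 share the interaction credit
+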
+
+
+
+
+ + ♻ ☆ Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models + + +
+ The popularity of pre-trained large models has revolutionized downstream +tasks across diverse fields, such as language, vision, and multi-modality. To +minimize the adaption cost for downstream tasks, many Parameter-Efficient +Fine-Tuning (PEFT) techniques are proposed for language and 2D image +pre-trained models. However, the specialized PEFT method for 3D pre-trained +models is still under-explored. To this end, we introduce Point-PEFT, a novel +framework for adapting point cloud pre-trained models with minimal learnable +parameters. Specifically, for a pre-trained 3D model, we freeze most of its +parameters, and only tune the newly added PEFT modules on downstream tasks, +which consist of a Point-prior Prompt and a Geometry-aware Adapter. The +Point-prior Prompt adopts a set of learnable prompt tokens, for which we +propose to construct a memory bank with domain-specific knowledge, and utilize +a parameter-free attention to enhance the prompt tokens. The Geometry-aware +Adapter aims to aggregate point cloud features within spatial neighborhoods to +capture fine-grained geometric information through local interactions. +Extensive experiments indicate that our Point-PEFT can achieve better +performance than the full fine-tuning on various downstream tasks, while using +only 5% of the trainable parameters, demonstrating the efficiency and +effectiveness of our approach. Code will be released at +https://github.com/Even-JK/PEFT-3D. + +
+
+ comment: 10 pages. The specialized PEFT framework for 3D pre-trained models, + which achieves competitive performance to full fine-tuning, and significantly + reduces the computational resources. Project page: + https://github.com/Even-JK/PEFT-3D +
+
+
+
+
+ + ♻ ☆ Policy Learning with Asymmetric Counterfactual Utilities + + +
+ Data-driven decision making plays an important role even in high stakes +settings like medicine and public policy. Learning optimal policies from +observed data requires a careful formulation of the utility function whose +expected value is maximized across a population. Although researchers typically +use utilities that depend on observed outcomes alone, in many settings the +decision maker's utility function is more properly characterized by the joint +set of potential outcomes under all actions. For example, the Hippocratic +principle to "do no harm" implies that the cost of causing death to a patient +who would otherwise survive without treatment is greater than the cost of +forgoing life-saving treatment. We consider optimal policy learning with +asymmetric counterfactual utility functions of this form that consider the +joint set of potential outcomes. We show that asymmetric counterfactual +utilities lead to an unidentifiable expected utility function, and so we first +partially identify it. Drawing on statistical decision theory, we then derive +minimax decision rules by minimizing the maximum expected utility loss relative +to different alternative policies. We show that one can learn minimax loss +decision rules from observed data by solving intermediate classification +problems, and establish that the finite sample excess expected utility loss of +this procedure is bounded by the regret of these intermediate classifiers. We +apply this conceptual framework and methodology to the decision about whether +or not to use right heart catheterization for patients with possible pulmonary +hypertension. + +
+
+
+
+
+ + ♻ ☆ LoftQ: LoRA-Fine-Tuning-Aware Quantization for Large Language Models + + +
+ Quantization is an indispensable technique for serving Large Language Models +(LLMs) and has recently found its way into LoRA fine-tuning. In this work we +focus on the scenario where quantization and LoRA fine-tuning are applied +together on a pre-trained model. In such cases it is common to observe a +consistent gap in the performance on downstream tasks between full fine-tuning +and quantization plus LoRA fine-tuning approach. In response, we propose LoftQ +(LoRA-Fine-Tuning-aware Quantization), a novel quantization framework that +simultaneously quantizes an LLM and finds a proper low-rank initialization for +LoRA fine-tuning. Such an initialization alleviates the discrepancy between the +quantized and full-precision model and significantly improves generalization in +downstream tasks. We evaluate our method on natural language understanding, +question answering, summarization, and natural language generation tasks. +Experiments show that our method is highly effective and outperforms existing +quantization methods, especially in the challenging 2-bit and 2/4-bit mixed +precision regimes. The code is available on https://github.com/yxli2123/LoftQ. + +
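+ A hedged sketch of the alternating initialization idea sketched above: quantize the part of the
+weight not covered by the adapters, then refit the low-rank factors by truncated SVD of the
+quantization residual so that Q + A @ B stays close to the original weight. The crude uniform
+quantizer below stands in for the low-bit formats used in practice and is an assumption of this
+sketch, not the paper's implementation.
+
+import numpy as np
+
+def quantize_uniform(W, bits=2):
+    levels = 2 ** bits - 1
+    lo, hi = W.min(), W.max()
+    step = (hi - lo) / levels
+    return np.round((W - lo) / step) * step + lo
+
+def alternating_init(W, rank=8, bits=2, iters=5):
+    A = np.zeros((W.shape[0], rank)); B = np.zeros((rank, W.shape[1]))
+    for _ in range(iters):
+        Q = quantize_uniform(W - A @ B, bits)                 # quantize what the adapters miss
+        U, S, Vt = np.linalg.svd(W - Q, full_matrices=False)
+        A, B = U[:, :rank] * S[:rank], Vt[:rank]              # best rank-r fit of the residual
+    return Q, A, B
+
+rng = np.random.default_rng(0)
+W = rng.normal(size=(128, 64))
+Q, A, B = alternating_init(W)
+print(np.linalg.norm(W - (Q + A @ B)) / np.linalg.norm(W))    # relative reconstruction error
+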
+
+
+
+
+ + ♻ ☆ What If the TV Was Off? Examining Counterfactual Reasoning Abilities of + Multi-modal Language Models + + +
+ Counterfactual reasoning, a fundamental aspect of human cognition, involves +contemplating alternatives to established facts or past events, significantly +enhancing our abilities in planning and decision-making. In light of the +advancements in current multi-modal large language models, we explore their +effectiveness in counterfactual reasoning. To facilitate this investigation, we +introduce a novel dataset, C-VQA, specifically designed to test the +counterfactual reasoning capabilities of modern multi-modal large language +models. This dataset is constructed by infusing original questions with +counterfactual presuppositions, spanning various types such as numerical and +boolean queries. It encompasses a mix of real and synthetic data, representing +a wide range of difficulty levels. Our thorough evaluations of contemporary +vision-language models using this dataset have revealed substantial performance +drops, with some models showing up to a 40% decrease, highlighting a +significant gap between current models and human-like vision reasoning +capabilities. We hope our dataset will serve as a vital benchmark for +evaluating the counterfactual reasoning capabilities of models. Code and +dataset are publicly available at https://bzhao.me/C-VQA/. + +
+
+
+
+
+ + ♻ ☆ Breaking Boundaries: Balancing Performance and Robustness in Deep + Wireless Traffic Forecasting CCS + + +
+ Balancing the trade-off between accuracy and robustness is a long-standing
+challenge in time series forecasting. While most existing robust algorithms
+have achieved certain suboptimal performance on clean data, sustaining the same
+performance level in the presence of data perturbations remains extremely hard.
+In this paper, we study a wide array of perturbation scenarios and propose
+novel defense mechanisms against adversarial attacks using real-world telecom
+data. We compare our strategy against two existing adversarial training
+algorithms under a range of maximal allowed perturbations, defined using
+$\ell_{\infty}$-norm, $\in [0.1,0.4]$. Our findings reveal that our hybrid
+strategy, which is composed of a classifier to detect adversarial examples, a
+denoiser to eliminate noise from the perturbed data samples, and a standard
+forecaster, achieves the best performance on both clean and perturbed data. Our
+optimal model can retain up to $92.02\%$ of the performance of the original
+forecasting model in terms of Mean Squared Error (MSE) on clean data, while
+being more robust than the standard adversarially trained models on perturbed
+data. Its MSE is 2.71$\times$ and 2.51$\times$ lower than those of the compared
+methods on normal and perturbed data, respectively. In addition, the components
+of our models can be trained in parallel, resulting in better computational
+efficiency. Our results indicate that we can optimally balance the trade-off
+between the performance and robustness of forecasting models by improving the
+classifier and denoiser, even in the presence of sophisticated and destructive
+poisoning attacks.
+
+
+
+ comment: Accepted for presentation at the ARTMAN workshop, part of the ACM + Conference on Computer and Communications Security (CCS), 2023 +
+
+
+
+
+ + ♻ ☆ FeTrIL: Feature Translation for Exemplar-Free Class-Incremental Learning + + +
+ Exemplar-free class-incremental learning is very challenging due to the +negative effect of catastrophic forgetting. A balance between stability and +plasticity of the incremental process is needed in order to obtain good +accuracy for past as well as new classes. Existing exemplar-free +class-incremental methods focus either on successive fine tuning of the model, +thus favoring plasticity, or on using a feature extractor fixed after the +initial incremental state, thus favoring stability. We introduce a method which +combines a fixed feature extractor and a pseudo-features generator to improve +the stability-plasticity balance. The generator uses a simple yet effective +geometric translation of new class features to create representations of past +classes, made of pseudo-features. The translation of features only requires the +storage of the centroid representations of past classes to produce their +pseudo-features. Actual features of new classes and pseudo-features of past +classes are fed into a linear classifier which is trained incrementally to +discriminate between all classes. The incremental process is much faster with +the proposed method compared to mainstream ones which update the entire deep +model. Experiments are performed with three challenging datasets, and different +incremental settings. A comparison with ten existing methods shows that our +method outperforms the others in most cases. + +
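+ A small sketch of the pseudo-feature generation step described above: features of a new class are
+translated by the offset between a stored past-class centroid and the new-class centroid, and the
+result is fed, together with real new-class features, to the incrementally trained linear
+classifier. Shapes and values below are toy assumptions.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+new_feats = rng.normal(loc=2.0, size=(100, 32))        # frozen-extractor features of a new class
+past_centroid = rng.normal(loc=-1.0, size=32)          # only the past class's centroid is stored
+new_centroid = new_feats.mean(axis=0)
+
+pseudo_past = new_feats + (past_centroid - new_centroid)   # geometric translation of features
+X = np.vstack([new_feats, pseudo_past])
+y = np.array([1] * len(new_feats) + [0] * len(pseudo_past))
+print(X.shape, y.shape)                                 # training data for the linear classifier
+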
+
+
+
+
+ + ♻ ☆ Marsellus: A Heterogeneous RISC-V AI-IoT End-Node SoC with 2-to-8b DNN + Acceleration and 30%-Boost Adaptive Body Biasing SC + + +
+ Emerging Artificial Intelligence-enabled Internet-of-Things (AI-IoT) +System-on-a-Chip (SoC) for augmented reality, personalized healthcare, and +nano-robotics need to run many diverse tasks within a power envelope of a few +tens of mW over a wide range of operating conditions: compute-intensive but +strongly quantized Deep Neural Network (DNN) inference, as well as signal +processing and control requiring high-precision floating-point. We present +Marsellus, an all-digital heterogeneous SoC for AI-IoT end-nodes fabricated in +GlobalFoundries 22nm FDX that combines 1) a general-purpose cluster of 16 +RISC-V Digital Signal Processing (DSP) cores attuned for the execution of a +diverse range of workloads exploiting 4-bit and 2-bit arithmetic extensions +(XpulpNN), combined with fused MAC&LOAD operations and floating-point support; +2) a 2-8bit Reconfigurable Binary Engine (RBE) to accelerate 3x3 and 1x1 +(pointwise) convolutions in DNNs; 3) a set of On-Chip Monitoring (OCM) blocks +connected to an Adaptive Body Biasing (ABB) generator and a hardware control +loop, enabling on-the-fly adaptation of transistor threshold voltages. +Marsellus achieves up to 180 Gop/s or 3.32 Top/s/W on 2-bit precision +arithmetic in software, and up to 637 Gop/s or 12.4 Top/s/W on +hardware-accelerated DNN layers. + +
+
+ comment: Post-print accepted by IEEE Journal of Solid-State Circuits. Fixed + metadata (was missing one co-author), added DOI of IEEE JSSC +
+
+
+
+
+ + ♻ ☆ Replay across Experiments: A Natural Extension of Off-Policy RL + + +
+ Replaying data is a principal mechanism underlying the stability and data +efficiency of off-policy reinforcement learning (RL). We present an effective +yet simple framework to extend the use of replays across multiple experiments, +minimally adapting the RL workflow for sizeable improvements in controller +performance and research iteration times. At its core, Replay Across +Experiments (RaE) involves reusing experience from previous experiments to +improve exploration and bootstrap learning while reducing required changes to a +minimum in comparison to prior work. We empirically show benefits across a +number of RL algorithms and challenging control domains spanning both +locomotion and manipulation, including hard exploration tasks from egocentric +vision. Through comprehensive ablations, we demonstrate robustness to the +quality and amount of data available and various hyperparameter choices. +Finally, we discuss how our approach can be applied more broadly across +research life cycles and can increase resilience by reloading data across +random seeds or hyperparameter variations. + +
+
+
+
+
+ + ♻ ☆ High-performance real-world optical computing trained by in situ + model-free optimization + + +
+ Optical computing systems provide high-speed and low-energy data processing +but face deficiencies in computationally demanding training and +simulation-to-reality gaps. We propose a model-free optimization (MFO) method +based on a score gradient estimation algorithm for computationally efficient in +situ training of optical computing systems. This approach treats an optical +computing system as a black box and back-propagates the loss directly to the +optical computing weights' probability distributions, circumventing the need +for a computationally heavy and biased system simulation. Our experiments on a +single-layer diffractive optical computing system show that MFO outperforms +hybrid training on the MNIST and FMNIST datasets. Furthermore, we demonstrate +image-free and high-speed classification of cells from their phase maps. Our +method's model-free and high-performance nature, combined with its low demand +for computational resources, expedites the transition of optical computing from +laboratory demonstrations to real-world applications. + +
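+ A hedged sketch of the score-gradient idea outlined above: the optical system is treated as a
+black box, candidate weights are sampled from a Gaussian, and the loss is back-propagated only to
+the parameters of that distribution via the log-likelihood score. The quadratic stand-in loss,
+sample counts, and learning rate are assumptions of this sketch.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+target = rng.normal(size=16)
+
+def black_box_loss(w):
+    # stands in for the measured loss of the physical optical system
+    return float(np.sum((w - target) ** 2))
+
+mu, sigma, lr = np.zeros(16), 0.5, 0.05
+for _ in range(300):
+    samples = mu + sigma * rng.normal(size=(64, 16))       # candidate optical weights
+    losses = np.array([black_box_loss(w) for w in samples])
+    baseline = losses.mean()                               # simple variance reduction
+    score = (samples - mu) / sigma ** 2                    # d/d(mu) of log N(w; mu, sigma^2 I)
+    grad_mu = ((losses - baseline)[:, None] * score).mean(axis=0)
+    mu -= lr * grad_mu
+
+print(black_box_loss(mu))       # small compared to the initial loss black_box_loss(np.zeros(16))
+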
+
+
+
+
+ + ♻ ☆ SE(3) Equivariant Augmented Coupling Flows + + +
+ Coupling normalizing flows allow for fast sampling and density evaluation, +making them the tool of choice for probabilistic modeling of physical systems. +However, the standard coupling architecture precludes endowing flows that +operate on the Cartesian coordinates of atoms with the SE(3) and permutation +invariances of physical systems. This work proposes a coupling flow that +preserves SE(3) and permutation equivariance by performing coordinate splits +along additional augmented dimensions. At each layer, the flow maps atoms' +positions into learned SE(3) invariant bases, where we apply standard flow +transformations, such as monotonic rational-quadratic splines, before returning +to the original basis. Crucially, our flow preserves fast sampling and density +evaluation, and may be used to produce unbiased estimates of expectations with +respect to the target distribution via importance sampling. When trained on the +DW4, LJ13, and QM9-positional datasets, our flow is competitive with +equivariant continuous normalizing flows, while allowing sampling more than an +order of magnitude faster. Moreover, to the best of our knowledge, we are the +first to learn the full Boltzmann distribution of alanine dipeptide by only +modeling the Cartesian positions of its atoms. Lastly, we demonstrate that our +flow can be trained to approximately sample from the Boltzmann distribution of +the DW4 and LJ13 particle systems using only their energy functions. + +
+
+
+
+
+ + ♻ ☆ Explaining Deep Learning Models for Age-related Gait Classification + based on time series acceleration + + +
+ Gait analysis holds significant importance in monitoring daily health,
+particularly among older adults. Advancements in sensor technology enable the
+capture of movement in real-life environments and generate big data. Machine
+learning, notably deep learning (DL), shows promise to use these big data in
+gait analysis. However, the inherent black-box nature of these models poses
+challenges for their clinical application. This study aims to enhance
+transparency in DL-based gait classification for age-related gait patterns
+using Explainable Artificial Intelligence, such as SHAP.
+ A total of 244 subjects, comprising 129 adults and 115 older adults (age>65),
+were included. They performed a 3-minute walking task while accelerometers were
+affixed to the lumbar segment L3. DL models, convolutional neural network (CNN)
+and gated recurrent unit (GRU), were trained using 1-stride and 8-stride
+accelerations, respectively, to classify adult and older adult groups. SHAP was
+employed to explain the models' predictions.
+ CNN achieved a satisfactory performance with an accuracy of 81.4% and an AUC
+of 0.89, and GRU demonstrated promising results with an accuracy of 84.5% and
+an AUC of 0.94. SHAP analysis revealed that both CNN and GRU assigned higher
+SHAP values to the data from vertical and walking directions, particularly
+emphasizing data around heel contact, spanning from the terminal swing to
+loading response phases. Furthermore, SHAP values indicated that GRU did not
+treat every stride equally.
+ CNN accurately distinguished between adults and older adults based on the
+characteristics of a single stride's data. GRU achieved accurate classification
+by considering the relationships and subtle differences between strides. In
+both models, data around heel contact emerged as most critical, suggesting
+differences in acceleration and deceleration patterns during walking between
+different age groups.
+
+
+
+
+
+
+ + ♻ ☆ Extending CAM-based XAI methods for Remote Sensing Imagery Segmentation + + +
+ Current AI-based methods do not provide comprehensible physical
+interpretations of the utilized data, extracted features, and
+predictions/inference operations. As a result, deep learning models trained
+using high-resolution satellite imagery lack transparency and explainability
+and can be merely seen as a black box, which limits their wide-level adoption.
+Experts need help understanding the complex behavior of AI models and the
+underlying decision-making process. The explainable artificial intelligence
+(XAI) field is an emerging field providing means for robust, practical, and
+trustworthy deployment of AI models. Several XAI techniques have been proposed
+for image classification tasks, whereas the interpretation of image
+segmentation remains largely unexplored. This paper seeks to bridge this gap
+by adapting the recent XAI classification algorithms and making them usable for
+multi-class image segmentation, where we mainly focus on buildings' segmentation
+from high-resolution satellite images. To benchmark and compare the performance
+of the proposed approaches, we introduce a new XAI evaluation methodology and
+metric based on "Entropy" to measure the model uncertainty. Conventional XAI
+evaluation methods rely mainly on feeding area-of-interest regions from the
+image back to the pre-trained (utility) model and then calculating the average
+change in the probability of the target class. Those evaluation metrics lack
+the needed robustness, and we show that using Entropy to monitor the model
+uncertainty in segmenting the pixels within the target class is more suitable.
+We hope this work will pave the way for additional XAI research for image
+segmentation and applications in the remote sensing discipline.
+
+
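+ An illustrative sketch of the entropy-based evaluation signal mentioned above: model uncertainty
+is summarized as the mean per-pixel entropy of the class probabilities over the pixels attributed
+to the target class. The toy logits, class count, and masking rule are assumptions of this sketch.
+
+import numpy as np
+
+def mean_entropy(probs, mask):
+    # probs: (H, W, C) softmax output; mask: (H, W) boolean region of interest
+    p = np.clip(probs[mask], 1e-12, 1.0)
+    per_pixel = -(p * np.log(p)).sum(axis=-1)   # entropy in nats
+    return float(per_pixel.mean())
+
+rng = np.random.default_rng(0)
+logits = rng.normal(size=(64, 64, 3))
+probs = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)
+mask = probs.argmax(axis=-1) == 1               # pixels predicted as the target (building) class
+print(mean_entropy(probs, mask))
+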
+
+
+
+
+ + ♻ ☆ SR-OOD: Out-of-Distribution Detection via Sample Repairing + + +
+ Out-of-distribution (OOD) detection is a crucial task for ensuring the +reliability and robustness of machine learning models. Recent works have shown +that generative models often assign high confidence scores to OOD samples, +indicating that they fail to capture the semantic information of the data. To +tackle this problem, we take advantage of sample repairing and propose a novel +OOD detection framework, namely SR-OOD. Our framework leverages the idea that +repairing an OOD sample can reveal its semantic inconsistency with the +in-distribution data. Specifically, our framework consists of two components: a +sample repairing module and a detection module. The sample repairing module +applies erosion to an input sample and uses a generative adversarial network to +repair it. The detection module then determines whether the input sample is OOD +using a distance metric. Our framework does not require any additional data or +label information for detection, making it applicable to various scenarios. We +conduct extensive experiments on three image datasets: CIFAR-10, CelebA, and +Pokemon. The results demonstrate that our approach achieves superior +performance over the state-of-the-art generative methods in OOD detection. + +
+
+ comment: This is an updated version of the paper +
+
+
+
+
+ + ♻ ☆ Decentralized Online Federated G-Network Learning for Lightweight + Intrusion Detection + + +
+ Cyberattacks are increasingly threatening networked systems, often with the
+emergence of new types of unknown (zero-day) attacks and the rise of vulnerable
+devices. Such attacks can also target multiple components of a Supply Chain,
+which can be protected via Machine Learning (ML)-based Intrusion Detection
+Systems (IDSs). However, the need to learn large amounts of labelled data often
+limits the applicability of ML-based IDSs to cybersystems that only have access
+to private local data, while distributed systems such as Supply Chains have
+multiple components, each of which must preserve its private data while being
+targeted by the same attack. To address this issue, this paper proposes a novel
+Decentralized and Online Federated Learning Intrusion Detection (DOF-ID)
+architecture based on the G-Network model with collaborative learning, that
+allows each IDS used by a specific component to learn from the experience
+gained in other components, in addition to its own local data, without
+violating the data privacy of other components. The performance evaluation
+results using public Kitsune and Bot-IoT datasets show that DOF-ID
+significantly improves the intrusion detection performance in all of the
+collaborating components, with acceptable computation time for online learning.
+
+
+
+
+
+
+ + ♻ ☆ ID-like Prompt Learning for Few-Shot Out-of-Distribution Detection + + +
+ Out-of-distribution (OOD) detection methods often exploit auxiliary outliers
+to train models to identify OOD samples, especially by discovering challenging
+outliers from an auxiliary outlier dataset to improve OOD detection. However,
+they may still face limitations in effectively distinguishing between the most
+challenging OOD samples that are much like in-distribution (ID) data, i.e.,
+ID-like samples. To this end, we propose a novel OOD detection framework that
+discovers ID-like outliers using CLIP from the vicinity space of the ID
+samples, thus helping to identify these most challenging OOD samples. Then a
+prompt learning framework is proposed that utilizes the identified ID-like
+outliers to further leverage the capabilities of CLIP for OOD detection.
+Benefiting from the powerful CLIP, we only need a small number of ID samples to
+learn the prompts of the model without exposing other auxiliary outlier
+datasets. By focusing on the most challenging ID-like OOD samples and elegantly
+exploiting the capabilities of CLIP, our method achieves superior few-shot
+learning performance on various real-world image datasets (e.g., in 4-shot OOD
+detection on the ImageNet-1k dataset, our method reduces the average FPR95 by
+12.16% and improves the average AUROC by 2.76%, compared to state-of-the-art
+methods).
+
+
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ A Survey of Graph Meets Large Language Model: Progress and Future + Directions + + +
+ Graphs play a significant role in representing and analyzing complex
+relationships in real-world applications such as citation networks, social
+networks, and biological data. Recently, Large Language Models (LLMs), which
+have achieved tremendous success in various domains, have also been leveraged
+in graph-related tasks to surpass traditional Graph Neural Networks (GNNs)
+based methods and yield state-of-the-art performance. In this survey, we
+present a comprehensive review and analysis of existing methods that integrate
+LLMs with graphs. First of all, we propose a new taxonomy, which organizes
+existing methods into three categories based on the role (i.e., enhancer,
+predictor, and alignment component) played by LLMs in graph-related tasks. Then
+we systematically survey the representative methods along the three categories
+of the taxonomy. Finally, we discuss the remaining limitations of existing
+studies and highlight promising avenues for future research. The relevant
+papers are summarized and will be consistently updated at:
+https://github.com/yhLeeee/Awesome-LLMs-in-Graph-tasks.
+
+
+
+ comment: Work in progress; 13 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Kernelized Reinforcement Learning with Order Optimal Regret Bounds NeurIPS + + +
+ Reinforcement learning (RL) has shown empirical success in various real world +settings with complex models and large state-action spaces. The existing +analytical results, however, typically focus on settings with a small number of +state-actions or simple models such as linearly modeled state-action value +functions. To derive RL policies that efficiently handle large state-action +spaces with more general value functions, some recent works have considered +nonlinear function approximation using kernel ridge regression. We propose +$\pi$-KRVI, an optimistic modification of least-squares value iteration, when +the state-action value function is represented by a reproducing kernel Hilbert +space (RKHS). We prove the first order-optimal regret guarantees under a +general setting. Our results show a significant polynomial in the number of +episodes improvement over the state of the art. In particular, with highly +non-smooth kernels (such as Neural Tangent kernel or some Mat\'ern kernels) the +existing results lead to trivial (superlinear in the number of episodes) regret +bounds. We show a sublinear regret bound that is order optimal in the case of +Mat\'ern kernels where a lower bound on regret is known. + +
+
+ comment: Advances in Neural Information Processing Systems (NeurIPS) +
+
+
+
+
+ + ♻ ☆ Addressing the Impact of Localized Training Data in Graph Neural + Networks + + +
+ Graph Neural Networks (GNNs) have achieved notable success in learning from
+graph-structured data, owing to their ability to capture intricate dependencies
+and relationships between nodes. They excel in various applications, including
+semi-supervised node classification, link prediction, and graph generation.
+However, it is important to acknowledge that the majority of state-of-the-art
+GNN models are built upon the assumption of an in-distribution setting, which
+hinders their performance on real-world graphs with dynamic structures. In this
+article, we aim to assess the impact of training GNNs on localized subsets of
+the graph. Such restricted training data may lead to a model that performs well
+in the specific region it was trained on but fails to generalize and make
+accurate predictions for the entire graph. In the context of graph-based
+semi-supervised learning (SSL), resource constraints often lead to scenarios
+where the dataset is large, but only a portion of it can be labeled, affecting
+the model's performance. This limitation affects tasks like anomaly detection
+or spam detection when labeling processes are biased or influenced by human
+subjectivity. To tackle the challenges posed by localized training data, we
+approach the problem as an out-of-distribution (OOD) data issue by aligning
+the distributions between the training data, which represents a small portion
+of labeled data, and the graph inference process that involves making
+predictions for the entire graph. We propose a regularization method to
+minimize distributional discrepancies between localized training data and graph
+inference, improving model performance on OOD data. Extensive tests on popular
+GNN models show significant performance improvement on three citation GNN
+benchmark datasets. The regularization approach effectively enhances model
+adaptation and generalization, overcoming challenges posed by OOD data.
+
+
+
+ comment: 6 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Likelihood-based Sensor Calibration using Affine Transformation + + +
+ An important task in the field of sensor technology is the efficient
+implementation of adaptation procedures of measurements from one sensor to
+another sensor of identical design. One idea is to use the estimation of an
+affine transformation between different systems, which can be improved by the
+knowledge of experts. This paper presents an improved solution from Glacier
+Research that was published back in 1973. The results demonstrate the
+adaptability of this solution for various applications, including software
+calibration of sensors, implementation of expert-based adaptation, and paving
+the way for future advancements such as distributed learning methods. We
+evaluate our research with simulations and also with real measured data of a
+multi-sensor board with 8 identical sensors. Both data set and evaluation
+script are provided for download. The results show an improvement for both the
+simulation and the experiments with real data.
+
+
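+ For reference, a minimal sketch of the basic estimation step underlying such calibration: fitting
+an affine map between readings of two nominally identical sensors by least squares. The simulated
+gain, offset, and noise level are assumptions, and the paper's expert-informed estimator is not
+reproduced here.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+x = rng.uniform(0, 100, size=200)                        # reference sensor
+y = 1.03 * x - 2.5 + rng.normal(scale=0.5, size=200)     # sensor to be calibrated
+
+X = np.column_stack([x, np.ones_like(x)])
+(gain, offset), *_ = np.linalg.lstsq(X, y, rcond=None)
+print(f"gain = {gain:.3f}, offset = {offset:.3f}")       # calibrated reading: (y - offset) / gain
+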
+
+
+
+
+ + ♻ ☆ Geometric instability of graph neural networks on large graphs + + +
+ We analyse the geometric instability of embeddings produced by graph neural +networks (GNNs). Existing methods are only applicable for small graphs and lack +context in the graph domain. We propose a simple, efficient and graph-native +Graph Gram Index (GGI) to measure such instability which is invariant to +permutation, orthogonal transformation, translation and order of evaluation. +This allows us to study the varying instability behaviour of GNN embeddings on +large graphs for both node classification and link prediction. + +
+
+
+
+
+ + ♻ ☆ STLGRU: Spatio-Temporal Lightweight Graph GRU for Traffic Flow + Prediction + + +
+ Reliable forecasting of traffic flow requires efficient modeling of traffic +data. Different correlations and influences arise in a dynamic traffic network, +making modeling a complicated task. Existing literature has proposed many +different methods to capture the complex underlying spatial-temporal relations +of traffic networks. However, methods still struggle to capture different local +and global dependencies of long-range nature. Also, as more and more +sophisticated methods are being proposed, models are increasingly becoming +memory-heavy and, thus, unsuitable for low-powered devices. In this paper, we +focus on solving these problems by proposing a novel deep learning framework - +STLGRU. Specifically, our proposed STLGRU can effectively capture both local +and global spatial-temporal relations of a traffic network using +memory-augmented attention and gating mechanism. Instead of employing separate +temporal and spatial components, we show that our memory module and gated unit +can learn the spatial-temporal dependencies successfully, allowing for reduced +memory usage with fewer parameters. We extensively experiment on several +real-world traffic prediction datasets to show that our model performs better +than existing methods while the memory footprint remains lower. Code is +available at \url{https://github.com/Kishor-Bhaumik/STLGRU}. + +
+
+ comment: We withdraw for now and shall further work on the manuscript and + upload it again +
+
+
+
+
+ + ♻ ☆ Segmentation of diagnostic tissue compartments on whole slide images + with renal thrombotic microangiopathies (TMAs) + + +
+ The thrombotic microangiopathies (TMAs) manifest in renal biopsy histology
+with a broad spectrum of acute and chronic findings. Precise diagnostic
+criteria for a renal biopsy diagnosis of TMA are missing. As a first step
+towards a machine learning- and computer vision-based analysis of whole slide
+images from renal biopsies, we trained a segmentation model for the decisive
+diagnostic kidney tissue compartments (artery, arteriole, glomerulus) on a set
+of whole slide images from renal biopsies with TMAs and Mimickers (distinct
+diseases with a similar nephropathological appearance as TMA like severe benign
+nephrosclerosis, various vasculitides, Bevacizumab-plug glomerulopathy,
+arteriolar light chain deposition disease). Our segmentation model combines a
+U-Net-based tissue detection with a Shifted windows-transformer architecture to
+reach excellent segmentation results for even the most severely altered
+glomeruli, arterioles and arteries, even on unseen staining domains from a
+different nephropathology lab. With accurate automatic segmentation of the
+decisive renal biopsy compartments in human renal vasculopathies, we have laid
+the foundation for large-scale compartment-specific machine learning and
+computer vision analysis of renal biopsy repositories with TMAs.
+
+
+
+ comment: 12 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ On the Role of Randomization in Adversarially Robust Classification + + +
+ Deep neural networks are known to be vulnerable to small adversarial +perturbations in test data. To defend against adversarial attacks, +probabilistic classifiers have been proposed as an alternative to deterministic +ones. However, literature has conflicting findings on the effectiveness of +probabilistic classifiers in comparison to deterministic ones. In this paper, +we clarify the role of randomization in building adversarially robust +classifiers. Given a base hypothesis set of deterministic classifiers, we show +the conditions under which a randomized ensemble outperforms the hypothesis set +in adversarial risk, extending previous results. Additionally, we show that for +any probabilistic binary classifier (including randomized ensembles), there +exists a deterministic classifier that outperforms it. Finally, we give an +explicit description of the deterministic hypothesis set that contains such a +deterministic classifier for many types of commonly used probabilistic +classifiers, i.e. randomized ensembles and parametric/input noise injection. + +
+
+ comment: 10 pages main paper (27 total), 2 figures in main paper. Neurips 2023 +
+
+
+
+
+ + ♻ ☆ Robust Ocean Subgrid-Scale Parameterizations Using Fourier Neural + Operators + + +
+ In climate simulations, small-scale processes shape ocean dynamics but remain +computationally expensive to resolve directly. For this reason, their +contributions are commonly approximated using empirical parameterizations, +which lead to significant errors in long-term projections. In this work, we +develop parameterizations based on Fourier Neural Operators, showcasing their +accuracy and generalizability in comparison to other approaches. Finally, we +discuss the potential and limitations of neural networks operating in the +frequency domain, paving the way for future investigation. + +
+
+
+
+
+ + ♻ ☆ COVID-19 detection using ViT transformer-based approach from Computed + Tomography Images + + +
+ Here, we introduce a novel approach to enhance the accuracy and efficiency
+of COVID-19 diagnosis using CT images. Leveraging state-of-the-art Transformer
+models in computer vision, we employed the base ViT Transformer configured for
+224x224-sized input images, modifying the output to suit the binary
+classification task. Notably, input images were resized from the standard CT
+scan size of 512x512 to match the model's expectations. Our method implements a
+systematic patient-level prediction strategy, classifying individual CT slices
+as COVID-19 or non-COVID. To determine the overall diagnosis for each patient,
+a majority voting approach as well as other thresholding approaches were
+employed. This method involves evaluating all CT slices for a given patient and
+assigning the patient the diagnosis that relates to the thresholding for the CT
+scan. This meticulous patient-level prediction process contributes to the
+robustness of our solution as it starts from 2D-slices to 3D-patient level.
+Throughout the evaluation process, our approach resulted in a 0.7 macro F1 score
+on the COV19-CT-DB validation set. To ensure the reliability and effectiveness
+of our model, we rigorously validate it on the extensive COV-19 CT dataset,
+which is meticulously annotated for the task. This dataset, with its
+comprehensive annotations, reinforces the overall robustness of our solution.
+
+
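+ A small sketch of the patient-level aggregation described above: each CT slice receives a COVID
+probability, and the patient is labelled positive when the fraction of positive slices crosses a
+threshold (0.5 recovers majority voting). The per-slice probabilities below are made up for
+illustration.
+
+import numpy as np
+
+def patient_diagnosis(slice_probs, slice_threshold=0.5, patient_threshold=0.5):
+    positive = np.asarray(slice_probs) >= slice_threshold
+    return bool(positive.mean() >= patient_threshold)
+
+slice_probs = [0.10, 0.80, 0.90, 0.70, 0.40, 0.95, 0.20]   # one patient's per-slice outputs
+print(patient_diagnosis(slice_probs))                       # True: most slices look positive
+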
+
+
+
+
+ + ♻ ☆ FairShap: A Data Re-weighting Approach for Algorithmic Fairness based on + Shapley Values + + +
+ Algorithmic fairness is of utmost societal importance, yet the current trend
+in large-scale machine learning models requires training with massive datasets
+that are frequently biased. In this context, pre-processing methods that focus
+on modeling and correcting bias in the data emerge as valuable approaches. In
+this paper, we propose FairShap, a novel instance-level data re-weighting
+method for fair algorithmic decision-making through data valuation by means of
+Shapley Values. FairShap is model-agnostic and easily interpretable, as it
+measures the contribution of each training data point to a predefined fairness
+metric. We empirically validate FairShap on several state-of-the-art datasets
+of different nature, with a variety of training scenarios and models and show
+how it yields fairer models with similar levels of accuracy to the baselines.
+We illustrate FairShap's interpretability by means of histograms and latent
+space visualizations. Moreover, we perform a utility-fairness study, and
+ablation and runtime experiments to illustrate the impact of the size of the
+reference dataset and FairShap's computational cost depending on the size of
+the dataset and the number of features. We believe that FairShap represents a
+promising direction in interpretable and model-agnostic approaches to
+algorithmic fairness that yield competitive accuracy even when only biased
+datasets are available.
+
+
+
+ comment: 33 pages, 11 figures, 7 tables +
+
+
+
+
+
+ ♻ ☆ BatteryML: An Open-source platform for Machine Learning on Battery
+ Degradation
+
+
+
+ Battery degradation remains a pivotal concern in the energy storage domain,
+with machine learning emerging as a potent tool to drive forward insights and
+solutions. However, this intersection of electrochemical science and machine
+learning poses complex challenges. Machine learning experts often grapple with
+the intricacies of battery science, while battery researchers face hurdles in
+adapting intricate models tailored to specific datasets. Beyond this, a
+cohesive standard for battery degradation modeling, inclusive of data formats
+and evaluative benchmarks, is conspicuously absent. Recognizing these
+impediments, we present BatteryML - a one-step, all-encompassing, and
+open-source platform designed to unify data preprocessing, feature extraction,
+and the implementation of both traditional and state-of-the-art models. This
+streamlined approach promises to enhance the practicality and efficiency of
+research applications. BatteryML seeks to fill this void, fostering an
+environment where experts from diverse specializations can collaboratively
+contribute, thus elevating the collective understanding and advancement of
+battery research. The code for our project is publicly available on GitHub at
+https://github.com/microsoft/BatteryML.
+
+
+
+
+
+
+ + ♻ ☆ Post-hoc Interpretability for Neural NLP: A Survey + + +
+ Neural networks for NLP are becoming increasingly complex and widespread, and
+there is a growing concern about whether these models are responsible to use.
+Explaining models helps to address the safety and ethical concerns and is
+essential for accountability. Interpretability serves to provide these
+explanations in terms that are understandable to humans. Additionally, post-hoc
+methods provide explanations after a model is learned and are generally
+model-agnostic. This survey provides a categorization of how recent post-hoc
+interpretability methods communicate explanations to humans, discusses each
+method in depth, and reviews how they are validated, as the latter is often a
+common concern.
+
+
+
+
+
+
+ + ♻ ☆ Towards Optimizing with Large Language Models + + +
+ In this work, we conduct an assessment of the optimization capabilities of +LLMs across various tasks and data sizes. Each of these tasks corresponds to +unique optimization domains, and LLMs are required to execute these tasks with +interactive prompting. That is, in each optimization step, the LLM generates +new solutions from the past generated solutions with their values, and then the +new solutions are evaluated and considered in the next optimization step. +Additionally, we introduce three distinct metrics for a comprehensive +assessment of task performance from various perspectives. These metrics offer +the advantage of being applicable for evaluating LLM performance across a broad +spectrum of optimization tasks and are less sensitive to variations in test +samples. By applying these metrics, we observe that LLMs exhibit strong +optimization capabilities when dealing with small-sized samples. However, their +performance is significantly influenced by factors like data size and values, +underscoring the importance of further research in the domain of optimization +tasks for LLMs. + +
+
+
+
+
+ + ♻ ☆ Long Range Graph Benchmark NeurIPS 2022 + + +
+ Graph Neural Networks (GNNs) that are based on the message passing (MP) +paradigm generally exchange information between 1-hop neighbors to build node +representations at each layer. In principle, such networks are not able to +capture long-range interactions (LRI) that may be desired or necessary for +learning a given task on graphs. Recently, there has been an increasing +interest in development of Transformer-based methods for graphs that can +consider full node connectivity beyond the original sparse structure, thus +enabling the modeling of LRI. However, MP-GNNs that simply rely on 1-hop +message passing often fare better in several existing graph benchmarks when +combined with positional feature representations, among other innovations, +hence limiting the perceived utility and ranking of Transformer-like +architectures. Here, we present the Long Range Graph Benchmark (LRGB) with 5 +graph learning datasets: PascalVOC-SP, COCO-SP, PCQM-Contact, Peptides-func and +Peptides-struct that arguably require LRI reasoning to achieve strong +performance in a given task. We benchmark both baseline GNNs and Graph +Transformer networks to verify that the models which capture long-range +dependencies perform significantly better on these tasks. Therefore, these +datasets are suitable for benchmarking and exploration of MP-GNNs and Graph +Transformer architectures that are intended to capture LRI. + +
+
+ comment: Added reference to T\"onshoff et al., 2023 in Sec. 4.1; NeurIPS 2022 + Track on D&B; Open-sourced at: https://github.com/vijaydwivedi75/lrgb +
+
+
+
+
+ + ♻ ☆ FIXED: Frustratingly Easy Domain Generalization with Mixup + + +
+ Domain generalization (DG) aims to learn a generalizable model from multiple +training domains such that it can perform well on unseen target domains. A +popular strategy is to augment training data to benefit generalization through +methods such as Mixup~\cite{zhang2018mixup}. While the vanilla Mixup can be +directly applied, theoretical and empirical investigations uncover several +shortcomings that limit its performance. Firstly, Mixup cannot effectively +identify the domain and class information that can be used for learning +invariant representations. Secondly, Mixup may introduce synthetic noisy data +points via random interpolation, which lowers its discrimination capability. +Based on the analysis, we propose a simple yet effective enhancement for +Mixup-based DG, namely domain-invariant Feature mIXup (FIX). It learns +domain-invariant representations for Mixup. To further enhance discrimination, +we leverage existing techniques to enlarge margins among classes to further +propose the domain-invariant Feature MIXup with Enhanced Discrimination (FIXED) +approach. We present theoretical insights about guarantees on its +effectiveness. Extensive experiments on seven public datasets across two +modalities including image classification (Digits-DG, PACS, Office-Home) and +time series (DSADS, PAMAP2, UCI-HAR, and USC-HAD) demonstrate that our approach +significantly outperforms nine state-of-the-art related methods, beating the +best performing baseline by 6.5\% on average in terms of test accuracy. Code is +available at: +https://github.com/jindongwang/transferlearning/tree/master/code/deep/fixed. + +
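+ A rough sketch of the feature-level mixup that FIXED builds on: representations and one-hot labels
+are interpolated with a Beta-distributed coefficient. The domain-invariant feature learning and
+margin enlargement that distinguish FIXED are not shown here, and the shapes below are toy
+assumptions.
+
+import numpy as np
+
+def feature_mixup(feats, labels_onehot, alpha=0.2, rng=None):
+    rng = rng or np.random.default_rng(0)
+    lam = rng.beta(alpha, alpha)
+    perm = rng.permutation(len(feats))
+    mixed_f = lam * feats + (1 - lam) * feats[perm]
+    mixed_y = lam * labels_onehot + (1 - lam) * labels_onehot[perm]
+    return mixed_f, mixed_y
+
+rng = np.random.default_rng(0)
+feats = rng.normal(size=(16, 64))                       # penultimate-layer features
+labels = np.eye(4)[rng.integers(0, 4, size=16)]         # one-hot labels for 4 classes
+mix_f, mix_y = feature_mixup(feats, labels, rng=rng)
+print(mix_f.shape, mix_y.shape)
+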
+
+ comment: First Conference on Parsimony and Learning (CPAL) 2024; code for DG + at: https://github.com/jindongwang/transferlearning/tree/master/code/DeepDG +
+
+
+
+
+ + ♻ ☆ Interpreting Reward Models in RLHF-Tuned Language Models Using Sparse + Autoencoders + + +
+ Large language models (LLMs) aligned to human preferences via reinforcement +learning from human feedback (RLHF) underpin many commercial applications of +LLM technology. Despite this, the impacts of RLHF on LLM internals remain +opaque. We propose a novel method for interpreting implicit reward models +(IRMs) in LLMs learned through RLHF. Our approach trains pairs of autoencoders +on activations from a base LLM and its RLHF-tuned variant. Through a comparison +of autoencoder hidden spaces, we identify features that reflect the accuracy of +the learned IRM. To illustrate our method, we fine-tune an LLM via RLHF to +learn a token-utility mapping and maximize the aggregate utility of generated +text. This is the first application of sparse autoencoders to interpreting +IRMs. Our method provides an abstract approximation of reward integrity and +holds promise for measuring alignment between specified objectives and learned +model behaviors. + +
+
+
+
+
+ + ♻ ☆ Investigating the Impact of Weight Sharing Decisions on Knowledge + Transfer in Continual Learning + + +
+ Continual Learning (CL) has generated attention as a method of avoiding +Catastrophic Forgetting (CF) in the sequential training of neural networks, +improving network efficiency and adaptability to different tasks. Additionally, +CL serves as an ideal setting for studying network behavior and Forward +Knowledge Transfer (FKT) between tasks. Pruning methods for CL train +subnetworks to handle the sequential tasks which allows us to take a structured +approach to investigating FKT. Sharing prior subnetworks' weights leverages +past knowledge for the current task through FKT. Understanding which weights to +share is important as sharing all weights can yield sub-optimal accuracy. This +paper investigates how different sharing decisions affect the FKT between +tasks. Through this lens we demonstrate how task complexity and similarity +influence the optimal weight sharing decisions, giving insights into the +relationships between tasks and helping inform decision making in similar CL +methods. We implement three sequential datasets designed to emphasize variation +in task complexity and similarity, reporting results for both ResNet-18 and +VGG-16. By sharing in accordance with the decisions supported by our findings, +we show that we can improve task accuracy compared to other sharing decisions. + +
+
+ comment: 5 Figures, 4 Tables, 2 Algorithms +
+
+
+
+
+ + ♻ ☆ Neural General Circulation Models + + +
+ General circulation models (GCMs) are the foundation of weather and climate +prediction. GCMs are physics-based simulators which combine a numerical solver +for large-scale dynamics with tuned representations for small-scale processes +such as cloud formation. Recently, machine learning (ML) models trained on +reanalysis data achieved comparable or better skill than GCMs for deterministic +weather forecasting. However, these models have not demonstrated improved +ensemble forecasts, or shown sufficient stability for long-term weather and +climate simulations. Here we present the first GCM that combines a +differentiable solver for atmospheric dynamics with ML components, and show +that it can generate forecasts of deterministic weather, ensemble weather and +climate on par with the best ML and physics-based methods. NeuralGCM is +competitive with ML models for 1-10 day forecasts, and with the European Centre +for Medium-Range Weather Forecasts ensemble prediction for 1-15 day forecasts. +With prescribed sea surface temperature, NeuralGCM can accurately track climate +metrics such as global mean temperature for multiple decades, and climate +forecasts with 140 km resolution exhibit emergent phenomena such as realistic +frequency and trajectories of tropical cyclones. For both weather and climate, +our approach offers orders of magnitude computational savings over conventional +GCMs. Our results show that end-to-end deep learning is compatible with tasks +performed by conventional GCMs, and can enhance the large-scale physical +simulations that are essential for understanding and predicting the Earth +system. + +
+
+ comment: 67 pages, 34 figures +
+
+
+
+
+ + ♻ ☆ Wasserstein Distributionally Robust Estimation in High Dimensions: + Performance Analysis and Optimal Hyperparameter Tuning + + +
+ Wasserstein distributionally robust optimization has recently emerged as a
+powerful framework for robust estimation, enjoying good out-of-sample
+performance guarantees, well-understood regularization effects, and
+computationally tractable reformulations. In such a framework, the estimator is
+obtained by minimizing the worst-case expected loss over all probability
+distributions which are close, in a Wasserstein sense, to the empirical
+distribution. In this paper, we propose a Wasserstein distributionally robust
+estimation framework to estimate an unknown parameter from noisy linear
+measurements, and we focus on the task of analyzing the squared error
+performance of such estimators. Our study is carried out in the modern
+high-dimensional proportional regime, where both the ambient dimension and the
+number of samples go to infinity at a proportional rate which encodes the
+under/over-parametrization of the problem. Under an isotropic Gaussian features
+assumption, we show that the squared error can be recovered as the solution of
+a convex-concave optimization problem which, surprisingly, involves at most
+four scalar variables. Importantly, the precise quantification of the squared
+error allows us to accurately and efficiently compare different ambiguity radii
+and to understand the effect of the under/over-parametrization on the
+estimation error. We conclude the paper with a list of exciting research
+directions enabled by our results.
+
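For orientation, the estimator studied in this line of work is typically written as a min-max problem over a Wasserstein ball around the empirical distribution. The formulation below is a standard form added for context; the notation is ours rather than the paper's:

$$
\hat{\beta} \in \arg\min_{\beta}\; \sup_{\mathbb{Q}\,:\, W_p(\mathbb{Q},\hat{\mathbb{P}}_n)\le\rho}\; \mathbb{E}_{(x,y)\sim\mathbb{Q}}\big[\ell\big(y - x^\top \beta\big)\big],
$$

where $\hat{\mathbb{P}}_n$ is the empirical distribution of the noisy linear measurements, $W_p$ is a Wasserstein distance of order $p$, $\rho$ is the ambiguity radius, and $\ell$ is the loss function.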
+
+ comment: This paper was previously titled "The Performance of Wasserstein + Distributionally Robust M-Estimators in High Dimensions" +
+
+
+
+
+ + ♻ ☆ Lion Secretly Solves Constrained Optimization: As Lyapunov Predicts + + +
+ Lion (Evolved Sign Momentum), a new optimizer discovered through program +search, has shown promising results in training large AI models. It performs +comparably or favorably to AdamW but with greater memory efficiency. As we can +expect from the results of a random search program, Lion incorporates elements +from several existing algorithms, including signed momentum, decoupled weight +decay, Polak, and Nesterov momentum, but does not fit into any existing +category of theoretically grounded optimizers. Thus, even though Lion appears +to perform well as a general-purpose optimizer for a wide range of tasks, its +theoretical basis remains uncertain. This lack of theoretical clarity limits +opportunities to further enhance and expand Lion's efficacy. + This work aims to demystify Lion. Based on both continuous-time and +discrete-time analysis, we demonstrate that Lion is a theoretically novel and +principled approach for minimizing a general loss function $f(x)$ while +enforcing a bound constraint $\|x\|_\infty \leq 1/\lambda$. Lion achieves this +through the incorporation of decoupled weight decay, where $\lambda$ represents +the weight decay coefficient. Our analysis is made possible by the development +of a new Lyapunov function for the Lion updates. It applies to a broader family +of Lion-$\kappa$ algorithms, where the $\text{sign}(\cdot)$ operator in Lion is +replaced by the subgradient of a convex function $\kappa$, leading to the +solution of a general composite optimization problem of $\min_x f(x) + +\kappa^*(x)$. Our findings provide valuable insights into the dynamics of Lion +and pave the way for further improvements and extensions of Lion-related +algorithms. + +
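For readers unfamiliar with the update being analyzed, a minimal sketch of the Lion step (sign of an interpolated momentum plus decoupled weight decay) is shown below. It follows the published Lion algorithm in spirit, but the hyperparameter names and default values here are illustrative assumptions, not the paper's settings.

```python
import numpy as np

def lion_step(x, grad, m, lr=1e-4, beta1=0.9, beta2=0.99, weight_decay=0.1):
    """One illustrative Lion update (Evolved Sign Momentum) on a NumPy array.

    The sign of an interpolated momentum drives the update, and decoupled
    weight decay (coefficient `weight_decay`, i.e. lambda) implicitly bounds
    the iterates, consistent with the constraint ||x||_inf <= 1/lambda
    discussed in the abstract.
    """
    update = np.sign(beta1 * m + (1 - beta1) * grad)  # sign of interpolated momentum
    x = x - lr * (update + weight_decay * x)          # decoupled weight decay
    m = beta2 * m + (1 - beta2) * grad                # momentum update for the next step
    return x, m
```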
+
+ comment: 31 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Revisiting LARS for Large Batch Training Generalization of Neural + Networks + + +
+ LARS and LAMB have emerged as prominent techniques in Large Batch Learning
+(LBL) to ensure training stability in AI. Convergence stability is a challenge
+in LBL, where the AI agent usually gets trapped in a sharp minimizer. To
+address this challenge, warm-up is an efficient technique, but it lacks a
+strong theoretical foundation. Specifically, the warm-up process often reduces
+gradients in the early phase, inadvertently preventing the agent from escaping
+the sharp minimizer early on. In light of this situation, we conduct empirical
+experiments to analyze the behaviors of LARS and LAMB with and without a
+warm-up strategy. Our analyses give a comprehensive insight into the behaviors
+of LARS, LAMB, and the necessity of a warm-up technique in LBL, including an
+explanation of their failure in many cases. Building upon these insights, we
+propose a novel algorithm called Time Varying LARS (TVLARS), which facilitates
+robust training in the initial phase without the need for warm-up. A
+configurable sigmoid-like function is employed in TVLARS to replace the warm-up
+process and enhance training stability. Moreover, TVLARS stimulates gradient
+exploration in the early phase, thus allowing it to surpass sharp minimizers
+early on and gradually transition to LARS, achieving the robustness of LARS in
+the later phases. Extensive experimental evaluations reveal that TVLARS
+consistently outperforms LARS and LAMB in most cases, with improvements of up
+to 2% in classification scenarios. Notably, in every case of self-supervised
+learning, TVLARS dominates LARS and LAMB with performance improvements of up to
+10%.
+
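The abstract only states that a configurable sigmoid-like function replaces warm-up, so the exact schedule below is purely an illustrative assumption: a layer-wise scaling factor that starts large (encouraging early gradient exploration) and smoothly decays toward the standard LARS behavior.

```python
import math

def tvlars_lr_scale(step, total_steps, base_scale=10.0, steepness=10.0):
    """Illustrative time-varying coefficient that decays along a sigmoid.

    The exact functional form used by TVLARS is not given in the abstract; this
    sketch assumes a scale that starts near `base_scale` and decays to ~1 over
    training, at which point the update reduces to standard LARS scaling.
    """
    progress = step / max(1, total_steps)
    decay = 1.0 / (1.0 + math.exp(steepness * (progress - 0.5)))  # ~1 -> ~0 over training
    return 1.0 + (base_scale - 1.0) * decay

def lars_trust_ratio(weight_norm, grad_norm, trust_coeff=0.001, eps=1e-9):
    # Standard LARS layer-wise trust ratio; in a TVLARS-style scheme it would be
    # multiplied by the time-varying factor above instead of using warm-up.
    return trust_coeff * weight_norm / (grad_norm + eps)
```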
+
+
+
+
+ + ♻ ☆ HiFA: High-fidelity Text-to-3D Generation with Advanced Diffusion + Guidance + + +
+ The advancements in automatic text-to-3D generation have been remarkable. +Most existing methods use pre-trained text-to-image diffusion models to +optimize 3D representations like Neural Radiance Fields (NeRFs) via +latent-space denoising score matching. Yet, these methods often result in +artifacts and inconsistencies across different views due to their suboptimal +optimization approaches and limited understanding of 3D geometry. Moreover, the +inherent constraints of NeRFs in rendering crisp geometry and stable textures +usually lead to a two-stage optimization to attain high-resolution details. +This work proposes holistic sampling and smoothing approaches to achieve +high-quality text-to-3D generation, all in a single-stage optimization. We +compute denoising scores in the text-to-image diffusion model's latent and +image spaces. Instead of randomly sampling timesteps (also referred to as noise +levels in denoising score matching), we introduce a novel timestep annealing +approach that progressively reduces the sampled timestep throughout +optimization. To generate high-quality renderings in a single-stage +optimization, we propose regularization for the variance of z-coordinates along +NeRF rays. To address texture flickering issues in NeRFs, we introduce a kernel +smoothing technique that refines importance sampling weights coarse-to-fine, +ensuring accurate and thorough sampling in high-density regions. Extensive +experiments demonstrate the superiority of our method over previous approaches, +enabling the generation of highly detailed and view-consistent 3D assets +through a single-stage training process. + +
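The timestep annealing idea mentioned above (progressively lowering the sampled noise level over the course of optimization rather than sampling it uniformly) can be sketched as follows; the schedule, bounds, and jitter are illustrative assumptions rather than the paper's exact settings.

```python
import random

def annealed_timestep(step, total_steps, t_max=980, t_min=20, jitter=0.1):
    """Illustrative timestep annealing for score-distillation-style optimization.

    Instead of sampling the diffusion timestep uniformly at every iteration,
    the center of the sampling range is lowered as optimization progresses, so
    early iterations see large noise levels (coarse structure) and late
    iterations see small ones (fine detail). The exact schedule is an assumption.
    """
    progress = step / max(1, total_steps)
    t_center = t_max - (t_max - t_min) * progress   # linearly annealed center
    spread = jitter * (t_max - t_min)               # small random jitter around it
    t = t_center + random.uniform(-spread, spread)
    return int(min(t_max, max(t_min, t)))
```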
+
+ comment: Project page: https://hifa-team.github.io/HiFA-site/ +
+
+
+
+
+ + ♻ ☆ Geometry-Aware Adaptation for Pretrained Models NeurIPS 2023 + + +
+ Machine learning models -- including prominent zero-shot models -- are often +trained on datasets whose labels are only a small proportion of a larger label +space. Such spaces are commonly equipped with a metric that relates the labels +via distances between them. We propose a simple approach to exploit this +information to adapt the trained model to reliably predict new classes -- or, +in the case of zero-shot prediction, to improve its performance -- without any +additional training. Our technique is a drop-in replacement of the standard +prediction rule, swapping argmax with the Fr\'echet mean. We provide a +comprehensive theoretical analysis for this approach, studying (i) +learning-theoretic results trading off label space diameter, sample complexity, +and model dimension, (ii) characterizations of the full range of scenarios in +which it is possible to predict any unobserved class, and (iii) an optimal +active learning-like next class selection procedure to obtain optimal training +classes for when it is not possible to predict the entire range of unobserved +classes. Empirically, using easily-available external metrics, our proposed +approach, Loki, gains up to 29.7% relative improvement over SimCLR on ImageNet +and scales to hundreds of thousands of classes. When no such metric is +available, Loki can use self-derived metrics from class embeddings and obtains +a 10.5% improvement on pretrained zero-shot models such as CLIP. + +
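The drop-in prediction rule described above (replacing argmax over observed classes with a Fréchet mean over the full metric label space) can be sketched as below. The discrete weighted-Fréchet-mean formulation is one standard reading of the abstract, and all variable names are ours.

```python
import numpy as np

def frechet_mean_predict(probs, dist):
    """Illustrative drop-in replacement for argmax prediction.

    probs: (num_observed_classes,) model probabilities over the classes the
           model was trained on.
    dist:  (num_all_classes, num_observed_classes) pairwise distances in the
           label metric space, covering unobserved classes as well.

    Returns the label (possibly unobserved) minimizing the expected squared
    distance under the model's predictive distribution, i.e. a discrete
    weighted Frechet mean. This is an illustrative reading of the abstract.
    """
    expected_sq_dist = (dist ** 2) @ probs  # shape: (num_all_classes,)
    return int(np.argmin(expected_sq_dist))
```

When `dist` restricts to the observed classes with zero self-distance, this rule reduces to the usual argmax, which is why it can act as a drop-in replacement.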
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ GraSS: Contrastive Learning with Gradient Guided Sampling Strategy for + Remote Sensing Image Semantic Segmentation + + +
+ Self-supervised contrastive learning (SSCL) has achieved significant
+milestones in remote sensing image (RSI) understanding. Its essence lies in
+designing an unsupervised instance discrimination pretext task to extract image
+features from a large number of unlabeled images that are beneficial for
+downstream tasks. However, existing instance-discrimination-based SSCL suffers
+from two limitations when applied to the RSI semantic segmentation task: 1) a
+positive sample confounding issue, and 2) a feature adaptation bias, which
+arises when image-level features are applied to semantic segmentation tasks
+that require pixel-level or object-level features. In this study, we observe
+that the discrimination information can be mapped to specific regions in an RSI
+through the gradient of the unsupervised contrastive loss, and these regions
+tend to contain singular ground objects. Based on this, we propose contrastive
+learning with a Gradient guided Sampling Strategy (GraSS) for RSI semantic segmentation.
+GraSS consists of two stages: Instance Discrimination warm-up (ID warm-up) and
+Gradient guided Sampling contrastive training (GS training). The ID warm-up
+aims to provide initial discrimination information to the contrastive loss
+gradients. The GS training stage aims to utilize the discrimination information
+contained in the contrastive loss gradients and adaptively select regions in
+RSI patches that contain more singular ground objects, in order to construct
+new positive and negative samples. Experimental results on three open datasets
+demonstrate that GraSS effectively enhances the performance of SSCL in
+high-resolution RSI semantic segmentation. Compared to seven baseline methods
+from five different types of SSCL, GraSS achieves an average improvement of
+1.57% and a maximum improvement of 3.58% in terms of mean intersection over
+union. The source code is available at https://github.com/GeoX-Lab/GraSS.
+
+
+ comment: 14 pages, 10 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ FedSOL: Stabilized Orthogonal Learning in Federated Learning + + +
+ Federated Learning (FL) aggregates locally trained models from individual +clients to construct a global model. While FL enables learning a model with +data privacy, it often suffers from significant performance degradation when +client data distributions are heterogeneous. Many previous FL algorithms have +addressed this issue by introducing various proximal restrictions. These +restrictions aim to encourage global alignment by constraining the deviation of +local learning from the global objective. However, they inherently limit local +learning by interfering with the original local objectives. Recently, an +alternative approach has emerged to improve local learning generality. By +obtaining local models within a smooth loss landscape, this approach mitigates +conflicts among different local objectives of the clients. Yet, it does not +ensure stable global alignment, as local learning does not take the global +objective into account. In this study, we propose Federated Stability on +Learning (FedSoL), which combines both the concepts of global alignment and +local generality. In FedSoL, the local learning seeks a parameter region robust +against proximal perturbations. This strategy introduces an implicit proximal +restriction effect in local learning while maintaining the original local +objective for parameter update. Our experiments show that FedSoL consistently +achieves state-of-the-art performance on various setups. + +
+
+ comment: 19 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ PREM: A Simple Yet Effective Approach for Node-Level Graph Anomaly + Detection ICDM + 2023 + + +
+ Node-level graph anomaly detection (GAD) plays a critical role in identifying
+anomalous nodes from graph-structured data in various domains such as medicine,
+social networks, and e-commerce. However, challenges have arisen due to the
+diversity of anomalies and the dearth of labeled data. Existing methodologies -
+reconstruction-based and contrastive learning - while effective, often suffer
+from efficiency issues, stemming from their complex objectives and elaborate
+modules. To improve the efficiency of GAD, we introduce a simple method termed
+PREprocessing and Matching (PREM for short). Our approach streamlines GAD,
+reducing time and memory consumption while maintaining powerful anomaly
+detection capabilities. Comprising two modules - a pre-processing module and an
+ego-neighbor matching module - PREM eliminates the necessity for
+message-passing propagation during training, and employs a simple contrastive
+loss, leading to considerable reductions in training time and memory usage.
+Moreover, through rigorous evaluations on five real-world datasets, our method
+demonstrates robustness and effectiveness. Notably, when validated on the ACM
+dataset, PREM achieved a 5% improvement in AUC, a 9-fold increase in training
+speed, and a sharp reduction in memory usage compared to the most efficient baseline.
+
+
+ comment: Accepted by IEEE International Conference of Data Mining 2023 (ICDM + 2023) +
+
+
+
+
+ + ♻ ☆ OccamNet: A Fast Neural Model for Symbolic Regression at Scale + + +
+ Neural networks' expressiveness comes at the cost of complex, black-box +models that often extrapolate poorly beyond the domain of the training dataset, +conflicting with the goal of finding compact analytic expressions to describe +scientific data. We introduce OccamNet, a neural network model that finds +interpretable, compact, and sparse symbolic fits to data, \`a la Occam's razor. +Our model defines a probability distribution over functions with efficient +sampling and function evaluation. We train by sampling functions and biasing +the probability mass toward better fitting solutions, backpropagating using +cross-entropy matching in a reinforcement-learning loss. OccamNet can identify +symbolic fits for a variety of problems, including analytic and non-analytic +functions, implicit functions, and simple image classification, and can +outperform state-of-the-art symbolic regression methods on real-world +regression datasets. Our method requires a minimal memory footprint, fits +complicated functions in minutes on a single CPU, and scales on a GPU. + +
+
+
+
+
+ + ♻ ☆ Almost Equivariance via Lie Algebra Convolutions + + +
+ Recently, the equivariance of models with respect to a group action has +become an important topic of research in machine learning. However, imbuing an +architecture with a specific group equivariance imposes a strong prior on the +types of data transformations that the model expects to see. While +strictly-equivariant models enforce symmetries, real-world data does not always +conform to such strict equivariances, be it due to noise in the data or +underlying physical laws that encode only approximate or partial symmetries. In +such cases, the prior of strict equivariance can actually prove too strong and +cause models to underperform on real-world data. Therefore, in this work we +study a closely related topic, that of almost equivariance. We provide a +definition of almost equivariance that differs from those extant in the current +literature and give a practical method for encoding almost equivariance in +models by appealing to the Lie algebra of a Lie group. Specifically, we define +Lie algebra convolutions and demonstrate that they offer several benefits over +Lie group convolutions, including being well-defined for non-compact groups. +From there, we pivot to the realm of theory and demonstrate connections between +the notions of equivariance and isometry and those of almost equivariance and +almost isometry, respectively. We prove two existence theorems, one showing the +existence of almost isometries within bounded distance of isometries of a +general manifold, and another showing the converse for Hilbert spaces. We then +extend these theorems to prove the existence of almost equivariant manifold +embeddings within bounded distance of fully equivariant embedding functions, +subject to certain constraints on the group action and the function class. +Finally, we demonstrate the validity of our approach by benchmarking against +datasets in fully equivariant and almost equivariant settings. + +
+
+
+
+
+ + ♻ ☆ Stitched ViTs are Flexible Vision Backbones + + +
+ Large pretrained plain vision Transformers (ViTs) have been the workhorse for +many downstream tasks. However, existing works utilizing off-the-shelf ViTs are +inefficient in terms of training and deployment, because adopting ViTs with +individual sizes requires separate trainings and is restricted by fixed +performance-efficiency trade-offs. In this paper, we are inspired by stitchable +neural networks (SN-Net), which is a new framework that cheaply produces a +single model that covers rich subnetworks by stitching pretrained model +families, supporting diverse performance-efficiency trade-offs at runtime. +Building upon this foundation, we introduce SN-Netv2, a systematically improved +model stitching framework to facilitate downstream task adaptation. +Specifically, we first propose a two-way stitching scheme to enlarge the +stitching space. We then design a resource-constrained sampling strategy that +takes into account the underlying FLOPs distributions in the space for better +sampling. Finally, we observe that learning stitching layers as a low-rank +update plays an essential role on downstream tasks to stabilize training and +ensure a good Pareto frontier. With extensive experiments on ImageNet-1K, +ADE20K, COCO-Stuff-10K and NYUv2, SN-Netv2 demonstrates superior performance +over SN-Netv1 on downstream dense predictions and shows strong ability as a +flexible vision backbone, achieving great advantages in both training +efficiency and deployment flexibility. Code is available at +https://github.com/ziplab/SN-Netv2. + +
+
+ comment: Tech report +
+
+
+
+
+ + ♻ ☆ Certifying LLM Safety against Adversarial Prompting + + +
+ Large language models (LLMs) released for public use incorporate guardrails +to ensure their output is safe, often referred to as "model alignment." An +aligned language model should decline a user's request to produce harmful +content. However, such safety measures are vulnerable to adversarial attacks, +which add maliciously designed token sequences to a harmful prompt to bypass +the model's safety guards. In this work, we introduce erase-and-check, the +first framework to defend against adversarial prompts with verifiable safety +guarantees. We defend against three attack modes: i) adversarial suffix, which +appends an adversarial sequence at the end of the prompt; ii) adversarial +insertion, where the adversarial sequence is inserted anywhere in the middle of +the prompt; and iii) adversarial infusion, where adversarial tokens are +inserted at arbitrary positions in the prompt, not necessarily as a contiguous +block. Our experimental results demonstrate that this procedure can obtain +strong certified safety guarantees on harmful prompts while maintaining good +empirical performance on safe prompts. For example, against adversarial +suffixes of length 20, it certifiably detects 92% of harmful prompts and labels +94% of safe prompts correctly using the open-source language model Llama 2 as +the safety filter. We further improve the filter's performance, in terms of +accuracy and speed, by replacing Llama 2 with a DistilBERT safety classifier +fine-tuned on safe and harmful prompts. Additionally, we propose two efficient +empirical defenses: i) RandEC, a randomized version of erase-and-check that +evaluates the safety filter on a small subset of the erased subsequences, and +ii) GradEC, a gradient-based version that optimizes the erased tokens to remove +the adversarial sequence. The code for our experiments is available at +https://github.com/aounon/certified-llm-safety. + +
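The erase-and-check procedure for the adversarial-suffix mode admits a compact sketch consistent with the description above: erase up to d trailing tokens one count at a time, run the safety filter on every truncation, and flag the prompt as harmful if any truncation is flagged. The `is_harmful` callable stands in for the safety filter (an LLM- or classifier-based detector in the paper); the interface here is our own simplification.

```python
def erase_and_check_suffix(tokens, is_harmful, max_erase=20):
    """Illustrative erase-and-check for the adversarial-suffix attack mode.

    tokens:     list of prompt tokens.
    is_harmful: callable mapping a token list to True/False (the safety filter).
    max_erase:  maximum adversarial suffix length to certify against.

    The prompt is labeled harmful if the filter flags the prompt itself or any
    version with 1..max_erase trailing tokens erased. Any harmful prompt with an
    appended suffix no longer than max_erase is therefore detected, provided the
    filter detects the clean harmful prompt.
    """
    for num_erased in range(0, max_erase + 1):
        candidate = tokens[: len(tokens) - num_erased] if num_erased else tokens
        if is_harmful(candidate):
            return True   # flagged harmful
    return False          # treated as safe for suffixes up to max_erase tokens
```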
+
+
+
+
+ + ♻ ☆ Towards Improving the Generation Quality of Autoregressive Slot VAEs + + +
+ Unconditional scene inference and generation are challenging to learn jointly +with a single compositional model. Despite encouraging progress on models that +extract object-centric representations (''slots'') from images, unconditional +generation of scenes from slots has received less attention. This is primarily +because learning the multi-object relations necessary to imagine coherent +scenes is difficult. We hypothesize that most existing slot-based models have a +limited ability to learn object correlations. We propose two improvements that +strengthen object correlation learning. The first is to condition the slots on +a global, scene-level variable that captures higher-order correlations between +slots. Second, we address the fundamental lack of a canonical order for objects +in images by proposing to learn a consistent order to use for the +autoregressive generation of scene objects. Specifically, we train an +autoregressive slot prior to sequentially generate scene objects following a +learned order. Ordered slot inference entails first estimating a randomly +ordered set of slots using existing approaches for extracting slots from +images, then aligning those slots to ordered slots generated autoregressively +with the slot prior. Our experiments across three multi-object environments +demonstrate clear gains in unconditional scene generation quality. Detailed +ablation studies are also provided that validate the two proposed improvements. + +
+
+ comment: Published in Neural Computation. 38 pages, 18 figures. Code and + videos available at https://github.com/pemami4911/segregate-relate-imagine +
+
+
+
+
+ + ♻ ☆ Context-lumpable stochastic bandits + + +
+ We consider a contextual bandit problem with $S$ contexts and $K$ actions. In +each round $t=1,2,\dots$, the learner observes a random context and chooses an +action based on its past experience. The learner then observes a random reward +whose mean is a function of the context and the action for the round. Under the +assumption that the contexts can be lumped into $r\le \min\{S,K\}$ groups such +that the mean reward for the various actions is the same for any two contexts +that are in the same group, we give an algorithm that outputs an +$\epsilon$-optimal policy after using at most $\widetilde O(r (S +K +)/\epsilon^2)$ samples with high probability and provide a matching +$\Omega(r(S+K)/\epsilon^2)$ lower bound. In the regret minimization setting, we +give an algorithm whose cumulative regret up to time $T$ is bounded by +$\widetilde O(\sqrt{r^3(S+K)T})$. To the best of our knowledge, we are the +first to show the near-optimal sample complexity in the PAC setting and +$\widetilde O(\sqrt{{poly}(r)(S+K)T})$ minimax regret in the online setting for +this problem. We also show our algorithms can be applied to more general +low-rank bandits and get improved regret bounds in some scenarios. + +
+
+
+
+
+ + ♻ ☆ ACHO: Adaptive Conformal Hyperparameter Optimization + + +
+ Several novel frameworks for hyperparameter search have emerged in the last +decade, but most rely on strict, often normal, distributional assumptions, +limiting search model flexibility. This paper proposes a novel optimization +framework based on upper confidence bound sampling of conformal confidence +intervals, whose weaker assumption of exchangeability enables greater choice of +search model architectures. Several such architectures were explored and +benchmarked on hyperparameter search of random forests and convolutional neural +networks, displaying satisfactory interval coverage and superior tuning +performance to random search. + +
+
+ comment: 12 pages, 4 tables, 4 figures +
+
+
+
+
+ + ♻ ☆ MAPSeg: Unified Unsupervised Domain Adaptation for Heterogeneous Medical + Image Segmentation Based on 3D Masked Autoencoding and Pseudo-Labeling + + +
+ Robust segmentation is critical for deriving quantitative measures from +large-scale, multi-center, and longitudinal medical scans. Manually annotating +medical scans, however, is expensive and labor-intensive and may not always be +available in every domain. Unsupervised domain adaptation (UDA) is a +well-studied technique that alleviates this label-scarcity problem by +leveraging available labels from another domain. In this study, we introduce +Masked Autoencoding and Pseudo-Labeling Segmentation (MAPSeg), a +$\textbf{unified}$ UDA framework with great versatility and superior +performance for heterogeneous and volumetric medical image segmentation. To the +best of our knowledge, this is the first study that systematically reviews and +develops a framework to tackle four different domain shifts in medical image +segmentation. More importantly, MAPSeg is the first framework that can be +applied to $\textbf{centralized}$, $\textbf{federated}$, and +$\textbf{test-time}$ UDA while maintaining comparable performance. We compare +MAPSeg with previous state-of-the-art methods on a private infant brain MRI +dataset and a public cardiac CT-MRI dataset, and MAPSeg outperforms others by a +large margin (10.5 Dice improvement on the private MRI dataset and 5.7 on the +public CT-MRI dataset). MAPSeg poses great practical value and can be applied +to real-world problems. Our code and pretrained model will be available later. + +
+
+ comment: 16 pages and 7 figures. Revised and extended to test-time and + federated domain adaptation. Xuzhe Zhang and Yuhao Wu are co-first authors. + Andrew F. Laine and Yun Wang are co-senior supervising authors +
+
+
+
+
+ + ♻ ☆ Identifying Systems with Symmetries using Equivariant Autoregressive + Reservoir Computers + + +
+ The investigation reported in this document focuses on identifying systems +with symmetries using equivariant autoregressive reservoir computers. General +results in structured matrix approximation theory are presented, exploring a +two-fold approach. Firstly, a comprehensive examination of generic +symmetry-preserving nonlinear time delay embedding is conducted. This involves +analyzing time series data sampled from an equivariant system under study. +Secondly, sparse least-squares methods are applied to discern approximate +representations of the output coupling matrices. These matrices play a pivotal +role in determining the nonlinear autoregressive representation of an +equivariant system. The structural characteristics of these matrices are +dictated by the set of symmetries inherent in the system. The document outlines +prototypical algorithms derived from the described techniques, offering insight +into their practical applications. Emphasis is placed on their effectiveness in +the identification and predictive simulation of equivariant nonlinear systems, +regardless of whether such systems exhibit chaotic behavior. + +
+
+ comment: The views expressed in the article do not necessarily represent the + views of the National Commission of Banks and Insurance Companies of Honduras +
+
+
+
+
+ + ♻ ☆ Understanding the robustness difference between stochastic gradient + descent and adaptive gradient methods + + +
+ Stochastic gradient descent (SGD) and adaptive gradient methods, such as Adam +and RMSProp, have been widely used in training deep neural networks. We +empirically show that while the difference between the standard generalization +performance of models trained using these methods is small, those trained using +SGD exhibit far greater robustness under input perturbations. Notably, our +investigation demonstrates the presence of irrelevant frequencies in natural +datasets, where alterations do not affect models' generalization performance. +However, models trained with adaptive methods show sensitivity to these +changes, suggesting that their use of irrelevant frequencies can lead to +solutions sensitive to perturbations. To better understand this difference, we +study the learning dynamics of gradient descent (GD) and sign gradient descent +(signGD) on a synthetic dataset that mirrors natural signals. With a +three-dimensional input space, the models optimized with GD and signGD have +standard risks close to zero but vary in their adversarial risks. Our result +shows that linear models' robustness to $\ell_2$-norm bounded changes is +inversely proportional to the model parameters' weight norm: a smaller weight +norm implies better robustness. In the context of deep learning, our +experiments show that SGD-trained neural networks have smaller Lipschitz +constants, explaining the better robustness to input perturbations than those +trained with adaptive gradient methods. + +
+
+ comment: Accepted at TMLR (Featured Certification). Code: see + https://github.com/averyma/opt-robust +
+
+
+
+
+ + ♻ ☆ Learning sources of variability from high-dimensional observational + studies + + +
+ Causal inference studies whether the presence of a variable influences an +observed outcome. As measured by quantities such as the "average treatment +effect," this paradigm is employed across numerous biological fields, from +vaccine and drug development to policy interventions. Unfortunately, the +majority of these methods are often limited to univariate outcomes. Our work +generalizes causal estimands to outcomes with any number of dimensions or any +measurable space, and formulates traditional causal estimands for nominal +variables as causal discrepancy tests. We propose a simple technique for +adjusting universally consistent conditional independence tests and prove that +these tests are universally consistent causal discrepancy tests. Numerical +experiments illustrate that our method, Causal CDcorr, leads to improvements in +both finite sample validity and power when compared to existing strategies. Our +methods are all open source and available at github.com/ebridge2/cdcorr. + +
+
+
+
+
+ + ♻ ☆ LegendreTron: Uprising Proper Multiclass Loss Learning ICML 2023 + + +
+ Loss functions serve as the foundation of supervised learning and are often +chosen prior to model development. To avoid potentially ad hoc choices of +losses, statistical decision theory describes a desirable property for losses +known as \emph{properness}, which asserts that Bayes' rule is optimal. Recent +works have sought to \emph{learn losses} and models jointly. Existing methods +do this by fitting an inverse canonical link function which monotonically maps +$\mathbb{R}$ to $[0,1]$ to estimate probabilities for binary problems. In this +paper, we extend monotonicity to maps between $\mathbb{R}^{C-1}$ and the +projected probability simplex $\tilde{\Delta}^{C-1}$ by using monotonicity of +gradients of convex functions. We present {\sc LegendreTron} as a novel and +practical method that jointly learns \emph{proper canonical losses} and +probabilities for multiclass problems. Tested on a benchmark of domains with up +to 1,000 classes, our experimental results show that our method consistently +outperforms the natural multiclass baseline under a $t$-test at 99% +significance on all datasets with greater than 10 classes. + +
+
+ comment: Accepted at the 40th International Conference on Machine Learning + (ICML 2023) +
+
+
+
+
+ + ♻ ☆ A Counterfactual Safety Margin Perspective on the Scoring of Autonomous + Vehicles' Riskiness + + +
+ Autonomous Vehicles (AVs) promise a range of societal advantages, including +broader access to mobility, reduced road accidents, and enhanced transportation +efficiency. However, evaluating the risks linked to AVs is complex due to +limited historical data and the swift progression of technology. This paper +presents a data-driven framework for assessing the risk of different AVs' +behaviors in various operational design domains (ODDs), based on counterfactual +simulations of "misbehaving" road users. We propose the notion of +counterfactual safety margin, which represents the minimum deviation from +nominal behavior that could cause a collision. This methodology not only +pinpoints the most critical scenarios but also quantifies the (relative) risk's +frequency and severity concerning AVs. Importantly, we show that our approach +is applicable even when the AV's behavioral policy remains undisclosed, through +worst- and best-case analyses, benefiting external entities like regulators and +risk evaluators. Our experimental outcomes demonstrate the correlation between +the safety margin, the quality of the driving policy, and the ODD, shedding +light on the relative risks of different AV providers. Overall, this work +contributes to the safety assessment of AVs and addresses legislative and +insurance concerns surrounding this burgeoning technology. + +
+
+ comment: updated experiments +
+
+
+
+
+ + ♻ ☆ Easing Color Shifts in Score-Based Diffusion Models + + +
+ Generated images of score-based models can suffer from errors in their +spatial means, an effect, referred to as a color shift, which grows for larger +images. This paper investigates a previously-introduced approach to mitigate +color shifts in score-based diffusion models. We quantify the performance of a +nonlinear bypass connection in the score network, designed to process the +spatial mean of the input and to predict the mean of the score function. We +show that this network architecture substantially improves the resulting +quality of the generated images, and that this improvement is approximately +independent of the size of the generated images. As a result, this modified +architecture offers a simple solution for the color shift problem across image +sizes. We additionally discuss the origin of color shifts in an idealized +setting in order to motivate the approach. + +
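A minimal sketch of the nonlinear bypass connection discussed above: the per-channel spatial mean of the input is routed through a small MLP, separately from the main score network, and substituted back in as the predicted mean of the score. Layer sizes, the combination rule, and the omission of noise-level conditioning are illustrative assumptions, not the paper's exact architecture.

```python
import torch
import torch.nn as nn

class MeanBypassScoreNet(nn.Module):
    """Score network with a nonlinear bypass for the spatial mean (illustrative).

    `backbone` is any image-to-image score network (e.g. a U-Net). The bypass
    MLP sees only the per-channel spatial mean of the input and predicts the
    per-channel mean of the score, which is added back uniformly over pixels.
    Noise-level conditioning is omitted here for brevity.
    """

    def __init__(self, backbone: nn.Module, channels: int, hidden: int = 64):
        super().__init__()
        self.backbone = backbone
        self.bypass = nn.Sequential(
            nn.Linear(channels, hidden), nn.SiLU(), nn.Linear(hidden, channels)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, channels, height, width)
        score = self.backbone(x)
        mean_in = x.mean(dim=(2, 3))        # (batch, channels) spatial mean of the input
        mean_score = self.bypass(mean_in)   # predicted mean of the score
        # Replace the backbone's spatial mean with the bypass prediction.
        score = score - score.mean(dim=(2, 3), keepdim=True)
        return score + mean_score[:, :, None, None]
```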
+
+
+
+
+ + ♻ ☆ A Primer on Deep Learning for Causal Inference + + +
+ This review systematizes the emerging literature for causal inference using +deep neural networks under the potential outcomes framework. It provides an +intuitive introduction on how deep learning can be used to estimate/predict +heterogeneous treatment effects and extend causal inference to settings where +confounding is non-linear, time varying, or encoded in text, networks, and +images. To maximize accessibility, we also introduce prerequisite concepts from +causal inference and deep learning. The survey differs from other treatments of +deep learning and causal inference in its sharp focus on observational causal +estimation, its extended exposition of key algorithms, and its detailed +tutorials for implementing, training, and selecting among deep estimators in +Tensorflow 2 available at github.com/kochbj/Deep-Learning-for-Causal-Inference. + +
+
+ comment: Forthcoming in Sociological Methods and Research +
+
+
+
+
+ + ♻ ☆ NeuroBack: Improving CDCL SAT Solving using Graph Neural Networks + + +
+ Propositional satisfiability (SAT) is an NP-complete problem that impacts
+many research fields, such as planning, verification, and security. Mainstream
+modern SAT solvers are based on the Conflict-Driven Clause Learning (CDCL)
+algorithm. Recent work aimed to enhance CDCL SAT solvers using Graph Neural
+Networks (GNNs). However, so far this approach either has not made solving more
+effective, or has required substantial GPU resources for frequent online model
+inferences. Aiming to make GNN improvements practical, this paper proposes an
+approach called NeuroBack, which builds on two insights: (1) predicting phases
+(i.e., values) of variables appearing in the majority (or even all) of the
+satisfying assignments is essential for CDCL SAT solving, and (2) it is
+sufficient to query the neural model only once for the predictions before the
+SAT solving starts. Once trained, the offline model inference allows NeuroBack
+to execute exclusively on the CPU, removing its reliance on GPU resources. To
+train NeuroBack, a new dataset called DataBack containing 120,286 data samples
+is created. Finally, NeuroBack is implemented as an enhancement to a
+state-of-the-art SAT solver called Kissat. As a result, it allowed Kissat to
+solve 5.2% more problems on the recent SAT competition problem set,
+SATCOMP-2022. NeuroBack therefore shows how machine learning can be harnessed
+to improve SAT solving in an effective and practical manner.
+
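The "query the model once, then solve on the CPU" workflow can be illustrated as a thin wrapper. The interfaces below (`predict_phases`, `make_solver`, `set_phases`) are hypothetical stand-ins, not NeuroBack's or Kissat's actual APIs.

```python
def solve_with_predicted_phases(cnf_clauses, num_vars, predict_phases, make_solver):
    """Illustrative NeuroBack-style workflow (all interfaces are hypothetical).

    1) Query the trained model once, offline, for a phase (polarity) prediction
       per variable: the value each variable is likely to take in most
       satisfying assignments.
    2) Hand those phases to a CDCL solver as preferred branching polarities and
       run the solver as usual; no further model inference is needed.
    """
    phases = predict_phases(cnf_clauses, num_vars)   # e.g. one boolean per variable
    solver = make_solver(cnf_clauses)
    # Encode phases as signed literals: +v to prefer True, -v to prefer False.
    solver.set_phases([v if phases[v - 1] else -v for v in range(1, num_vars + 1)])
    return solver.solve()
```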
+
+
+
+
+ + ♻ ☆ A Baseline Analysis of Reward Models' Ability To Accurately Analyze + Foundation Models Under Distribution Shift + + +
+ Foundation models, specifically Large Language Models (LLMs), have recently
+gained widespread attention and adoption. Reinforcement Learning with Human
+Feedback (RLHF) involves training a reward model to capture desired behaviors,
+which is then used to align an LLM. These reward models are additionally used
+at inference time to estimate how well LLM responses adhere to those desired
+behaviors. However, there is little work measuring how robust these reward
+models are to distribution shifts. In this work, we evaluate how reward model
+performance - measured via accuracy and calibration (i.e. alignment between
+accuracy and confidence) - is affected by distribution shift. We show novel
+calibration patterns and accuracy drops due to OOD prompts and responses, and
+that the reward model is more sensitive to shifts in responses than in prompts.
+Additionally, we adapt an OOD detection technique commonly used in
+classification to the reward model setting in order to detect these
+distribution shifts in prompts and responses.
+
+
+
+
+
+ + ♻ ☆ 3D helical CT Reconstruction with a Memory Efficient Learned Primal-Dual + Architecture + + +
+ Deep learning based computed tomography (CT) reconstruction has demonstrated +outstanding performance on simulated 2D low-dose CT data. This applies in +particular to domain adapted neural networks, which incorporate a handcrafted +physics model for CT imaging. Empirical evidence shows that employing such +architectures reduces the demand for training data and improves upon +generalisation. However, their training requires large computational resources +that quickly become prohibitive in 3D helical CT, which is the most common +acquisition geometry used for medical imaging. Furthermore, clinical data also +comes with other challenges not accounted for in simulations, like errors in +flux measurement, resolution mismatch and, most importantly, the absence of the +real ground truth. The necessity to have a computationally feasible training +combined with the need to address these issues has made it difficult to +evaluate deep learning based reconstruction on clinical 3D helical CT. This +paper modifies a domain adapted neural network architecture, the Learned +Primal-Dual (LPD), so that it can be trained and applied to reconstruction in +this setting. We achieve this by splitting the helical trajectory into sections +and applying the unrolled LPD iterations to those sections sequentially. To the +best of our knowledge, this work is the first to apply an unrolled deep +learning architecture for reconstruction on full-sized clinical data, like +those in the Low dose CT image and projection data set (LDCT). Moreover, +training and testing is done on a single GPU card with 24GB of memory. + +
+
+
+
+
+ + ♻ ☆ DeepDecipher: Accessing and Investigating Neuron Activation in Large + Language Models NeurIPS 2023 + + +
+ As large language models (LLMs) become more capable, there is an urgent need +for interpretable and transparent tools. Current methods are difficult to +implement, and accessible tools to analyze model internals are lacking. To +bridge this gap, we present DeepDecipher - an API and interface for probing +neurons in transformer models' MLP layers. DeepDecipher makes the outputs of +advanced interpretability techniques for LLMs readily available. The +easy-to-use interface also makes inspecting these complex models more +intuitive. This paper outlines DeepDecipher's design and capabilities. We +demonstrate how to analyze neurons, compare models, and gain insights into +model behavior. For example, we contrast DeepDecipher's functionality with +similar tools like Neuroscope and OpenAI's Neuron Explainer. DeepDecipher +enables efficient, scalable analysis of LLMs. By granting access to +state-of-the-art interpretability methods, DeepDecipher makes LLMs more +transparent, trustworthy, and safe. Researchers, engineers, and developers can +quickly diagnose issues, audit systems, and advance the field. + +
+
+ comment: 5 pages (9 total), 1 figure, submitted to NeurIPS 2023 Workshop XAIA +
+
+
+
+
+ + ♻ ☆ Hybrid Search for Efficient Planning with Completeness Guarantees NeurIPS 2023 + + +
+ Solving complex planning problems has been a long-standing challenge in +computer science. Learning-based subgoal search methods have shown promise in +tackling these problems, but they often suffer from a lack of completeness +guarantees, meaning that they may fail to find a solution even if one exists. +In this paper, we propose an efficient approach to augment a subgoal search +method to achieve completeness in discrete action spaces. Specifically, we +augment the high-level search with low-level actions to execute a multi-level +(hybrid) search, which we call complete subgoal search. This solution achieves +the best of both worlds: the practical efficiency of high-level search and the +completeness of low-level search. We apply the proposed search method to a +recently proposed subgoal search algorithm and evaluate the algorithm trained +on offline data on complex planning problems. We demonstrate that our complete +subgoal search not only guarantees completeness but can even improve +performance in terms of search expansions for instances that the high-level +could solve without low-level augmentations. Our approach makes it possible to +apply subgoal-level planning for systems where completeness is a critical +requirement. + +
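A schematic of the hybrid (complete subgoal) search described above: a best-first search whose expansion step proposes both learned subgoals and primitive low-level actions, so the search falls back to exhaustive low-level search whenever the subgoal generator fails. All components here (`propose_subgoals`, `low_level_actions`, the priority function) are hypothetical placeholders, and the sketch omits plan reconstruction for brevity.

```python
import heapq
import itertools

def complete_subgoal_search(start, is_goal, propose_subgoals, low_level_actions,
                            priority, max_expansions=100_000):
    """Illustrative multi-level best-first search (components are placeholders).

    Each expansion enqueues (a) states reachable via learned high-level subgoals
    and (b) states reachable via primitive low-level actions. Including (b)
    makes the search complete in a finite discrete action space, while (a)
    keeps the practical efficiency of subgoal-level planning. States must be
    hashable for the visited set.
    """
    counter = itertools.count()  # tie-breaker so states are never compared directly
    frontier = [(priority(start), next(counter), start)]
    visited = {start}
    for _ in range(max_expansions):
        if not frontier:
            break
        _, _, state = heapq.heappop(frontier)
        if is_goal(state):
            return state
        candidates = list(propose_subgoals(state)) + list(low_level_actions(state))
        for nxt in candidates:
            if nxt not in visited:
                visited.add(nxt)
                heapq.heappush(frontier, (priority(nxt), next(counter), nxt))
    return None
```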
+
+ comment: NeurIPS 2023 Poster +
+
+
+
+
+ + ♻ ☆ Quantum-probabilistic Hamiltonian learning for generative modelling & + anomaly detection + + +
+ The Hamiltonian of an isolated quantum mechanical system determines its +dynamics and physical behaviour. This study investigates the possibility of +learning and utilising a system's Hamiltonian and its variational thermal state +estimation for data analysis techniques. For this purpose, we employ the method +of Quantum Hamiltonian-based models for the generative modelling of simulated +Large Hadron Collider data and demonstrate the representability of such data as +a mixed state. In a further step, we use the learned Hamiltonian for anomaly +detection, showing that different sample types can form distinct dynamical +behaviours once treated as a quantum many-body system. We exploit these +characteristics to quantify the difference between sample types. Our findings +show that the methodologies designed for field theory computations can be +utilised in machine learning applications to employ theoretical approaches in +data analysis techniques. + +
+
+ comment: 14 pages, 7 figures. Accepted version for publication +
+
+
+
+
+ + ♻ ☆ On Separate Normalization in Self-supervised Transformers NIPS 2023 + + +
+ Self-supervised training methods for transformers have demonstrated +remarkable performance across various domains. Previous transformer-based +models, such as masked autoencoders (MAE), typically utilize a single +normalization layer for both the [CLS] symbol and the tokens. We propose in +this paper a simple modification that employs separate normalization layers for +the tokens and the [CLS] symbol to better capture their distinct +characteristics and enhance downstream task performance. Our method aims to +alleviate the potential negative effects of using the same normalization +statistics for both token types, which may not be optimally aligned with their +individual roles. We empirically show that by utilizing a separate +normalization layer, the [CLS] embeddings can better encode the global +contextual information and are distributed more uniformly in its anisotropic +space. When replacing the conventional normalization layer with the two +separate layers, we observe an average 2.7% performance improvement over the +image, natural language, and graph domains. + +
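The modification described above is small enough to show directly: use one normalization layer for the [CLS] position and another for the remaining tokens. A minimal PyTorch-style sketch, assuming the [CLS] token sits at sequence position 0 (an assumption of this sketch, not a requirement of the method):

```python
import torch
import torch.nn as nn

class SeparateNorm(nn.Module):
    """Separate normalization for the [CLS] token and the patch/word tokens.

    Drop-in replacement for a single nn.LayerNorm in a transformer block,
    assuming the [CLS] token is at sequence position 0.
    """

    def __init__(self, dim: int):
        super().__init__()
        self.cls_norm = nn.LayerNorm(dim)    # statistics/affine params for [CLS] only
        self.token_norm = nn.LayerNorm(dim)  # statistics/affine params for the other tokens

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq_len, dim), with [CLS] at position 0
        cls_out = self.cls_norm(x[:, :1, :])
        tok_out = self.token_norm(x[:, 1:, :])
        return torch.cat([cls_out, tok_out], dim=1)
```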
+
+ comment: NIPS 2023 +
+
+
+
+
+ + ♻ ☆ Explainability for Large Language Models: A Survey + + +
+ Large language models (LLMs) have demonstrated impressive capabilities in +natural language processing. However, their internal mechanisms are still +unclear and this lack of transparency poses unwanted risks for downstream +applications. Therefore, understanding and explaining these models is crucial +for elucidating their behaviors, limitations, and social impacts. In this +paper, we introduce a taxonomy of explainability techniques and provide a +structured overview of methods for explaining Transformer-based language +models. We categorize techniques based on the training paradigms of LLMs: +traditional fine-tuning-based paradigm and prompting-based paradigm. For each +paradigm, we summarize the goals and dominant approaches for generating local +explanations of individual predictions and global explanations of overall model +knowledge. We also discuss metrics for evaluating generated explanations, and +discuss how explanations can be leveraged to debug models and improve +performance. Lastly, we examine key challenges and emerging opportunities for +explanation techniques in the era of LLMs in comparison to conventional machine +learning models. + +
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ Viewport Prediction for Volumetric Video Streaming by Exploring Video + Saliency and Trajectory Information + + +
+ Volumetric video, also known as hologram video, is a novel medium that
+portrays natural content in Virtual Reality (VR), Augmented Reality (AR), and
+Mixed Reality (MR). It is expected to be the next-gen video technology and a
+prevalent use case for 5G and beyond wireless communication. Considering that
+each user typically only watches a section of the volumetric video, known as
+the viewport, it is essential to have precise viewport prediction for optimal
+performance. However, research on this topic is still in its infancy. To this
+end, this paper proposes a novel approach, named Saliency and
+Trajectory Viewport Prediction (STVP), which aims to improve the precision of
+viewport prediction in volumetric video streaming. STVP extensively
+utilizes video saliency information and viewport trajectory. To our knowledge,
+this is the first comprehensive study of viewport prediction in volumetric
+video streaming. In particular, we introduce a novel sampling method, Uniform
+Random Sampling (URS), to reduce computational complexity while still
+preserving video features in an efficient manner. Then we present a saliency
+detection technique that incorporates both spatial and temporal information for
+detecting static, dynamic geometric, and color salient regions. Finally, we
+intelligently fuse saliency and trajectory information to achieve more accurate
+viewport prediction. We conduct extensive simulations to evaluate the
+effectiveness of our proposed viewport prediction methods using
+state-of-the-art volumetric video sequences. The experimental results show the
+superiority of the proposed method over existing schemes. The dataset and
+source code will be made publicly accessible after acceptance.
+
+
+
+
+
+ + ♻ ☆ PromptCARE: Prompt Copyright Protection by Watermark Injection and + Verification + + +
+ Large language models (LLMs) have witnessed a meteoric rise in popularity +among the general public users over the past few months, facilitating diverse +downstream tasks with human-level accuracy and proficiency. Prompts play an +essential role in this success, which efficiently adapt pre-trained LLMs to +task-specific applications by simply prepending a sequence of tokens to the +query texts. However, designing and selecting an optimal prompt can be both +expensive and demanding, leading to the emergence of Prompt-as-a-Service +providers who profit by providing well-designed prompts for authorized use. +With the growing popularity of prompts and their indispensable role in +LLM-based services, there is an urgent need to protect the copyright of prompts +against unauthorized use. + In this paper, we propose PromptCARE, the first framework for prompt +copyright protection through watermark injection and verification. Prompt +watermarking presents unique challenges that render existing watermarking +techniques developed for model and dataset copyright verification ineffective. +PromptCARE overcomes these hurdles by proposing watermark injection and +verification schemes tailor-made for prompts and NLP characteristics. Extensive +experiments on six well-known benchmark datasets, using three prevalent +pre-trained LLMs (BERT, RoBERTa, and Facebook OPT-1.3b), demonstrate the +effectiveness, harmlessness, robustness, and stealthiness of PromptCARE. + +
+
+ comment: To Appear in the 45th IEEE Symposium on Security and Privacy 2024, + code is available at: https://github.com/grasses/PromptCARE +
+
+
+
+
+ + ♻ ☆ CompenHR: Efficient Full Compensation for High-resolution Projector + + +
+ Full projector compensation is a practical task of projector-camera systems. +It aims to find a projector input image, named compensation image, such that +when projected it cancels the geometric and photometric distortions due to the +physical environment and hardware. State-of-the-art methods use deep learning +to address this problem and show promising performance for low-resolution +setups. However, directly applying deep learning to high-resolution setups is +impractical due to the long training time and high memory cost. To address this +issue, this paper proposes a practical full compensation solution. Firstly, we +design an attention-based grid refinement network to improve geometric +correction quality. Secondly, we integrate a novel sampling scheme into an +end-to-end compensation network to alleviate computation and introduce +attention blocks to preserve key features. Finally, we construct a benchmark +dataset for high-resolution projector full compensation. In experiments, our +method demonstrates clear advantages in both efficiency and quality. + +
+
+
+
+
+ + ♻ ☆ M$^{2}$UGen: Multi-modal Music Understanding and Generation with the + Power of Large Language Models + + +
+ The current landscape of research leveraging large language models (LLMs) is +experiencing a surge. Many works harness the powerful reasoning capabilities of +these models to comprehend various modalities, such as text, speech, images, +videos, etc. They also utilize LLMs to understand human intention and generate +desired outputs like images, videos, and music. However, research that combines +both understanding and generation using LLMs is still limited and in its +nascent stage. To address this gap, we introduce a Multi-modal Music +Understanding and Generation (M$^{2}$UGen) framework that integrates LLM's +abilities to comprehend and generate music for different modalities. The +M$^{2}$UGen framework is purpose-built to unlock creative potential from +diverse sources of inspiration, encompassing music, image, and video through +the use of pretrained MERT, ViT, and ViViT models, respectively. To enable +music generation, we explore the use of AudioLDM 2 and MusicGen. Bridging +multi-modal understanding and music generation is accomplished through the +integration of the LLaMA 2 model. Furthermore, we make use of the MU-LLaMA +model to generate extensive datasets that support text/image/video-to-music +generation, facilitating the training of our M$^{2}$UGen framework. We conduct +a thorough evaluation of our proposed framework. The experimental results +demonstrate that our model achieves or surpasses the performance of the current +state-of-the-art models. + +
+
+
+
+
+ + ♻ ☆ E-polis: A serious game for the gamification of sociological surveys + + +
+ E-polis is a multi-platform serious game that gamifies a sociological survey
+for studying young people's opinions regarding their ideal society. The
+gameplay is based on a user navigating through a digital city and experiencing
+changes triggered by their responses to social and pedagogical questions,
+known as "dilemmas". The game integrates elements of adventure, exploration,
+and simulation. Unity was the game engine selected for development, while a
+middleware component was also developed to gather and process the users' data.
+At the end of each game, users are presented with a blueprint of the city they
+navigated to showcase how their choices influenced its development. This
+motivates them to reflect on their answers and validate them. The game can be
+used to collect data on a variety of topics, such as social justice and
+economic development, or to promote civic engagement and encourage young
+people to think critically about the world around them.
+
+
+
+ comment: 8 pages, 11 figures, Proceedings of the International Conference on + Applied Mathematics & Computer Science (ICAMCS) 2023 +
+
+
+
+
+ + ♻ ☆ MCPNS: A Macropixel Collocated Position and Its Neighbors Search for + Plenoptic 2.0 Video Coding + + +
+ Recently, it was demonstrated that a newly focused plenoptic 2.0 camera can
+capture much higher spatial resolution owing to its effective light field
+sampling, as compared to a traditional unfocused plenoptic 1.0 camera. However,
+due to the difference in optical structure between plenoptic 1.0 and 2.0
+cameras, the existing fast motion estimation (ME) method for plenoptic 1.0
+videos is expected to be sub-optimal for encoding plenoptic 2.0 videos. In
+this paper, we point out the main differences in motion characteristics between
+plenoptic 1.0 and 2.0 videos and then propose a new fast ME, called macropixel
+collocated position and its neighbors search (MCPNS), for plenoptic 2.0 videos.
+In detail, we propose to reduce the number of macropixel collocated position
+(MCP) search candidates based on the new observation of center-biased motion
+vector distribution at macropixel resolution. After that, due to the large
+motion deviation around each MCP location in plenoptic 2.0 videos, we propose
+to select a certain number of key MCP locations with the lowest matching cost
+to perform the neighbors MCP search to improve the motion search accuracy.
+Different from existing methods, our method can achieve better performance
+without requiring prior knowledge of microlens array orientations. Our
+simulation results confirmed the effectiveness of the proposed algorithm in
+terms of both bitrate savings and computational costs compared to existing
+methods.
+
+
+
+ comment: Under review +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 76 + +
+
+
+ + ☆ How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for + Vision LLMs SC + + +
+ This work focuses on the potential of Vision LLMs (VLLMs) in visual +reasoning. Different from prior studies, we shift our focus from evaluating +standard performance to introducing a comprehensive safety evaluation suite, +covering both out-of-distribution (OOD) generalization and adversarial +robustness. For the OOD evaluation, we present two novel VQA datasets, each +with one variant, designed to test model performance under challenging +conditions. In exploring adversarial robustness, we propose a straightforward +attack strategy for misleading VLLMs to produce visual-unrelated responses. +Moreover, we assess the efficacy of two jailbreaking strategies, targeting +either the vision or language component of VLLMs. Our evaluation of 21 diverse +models, ranging from open-source VLLMs to GPT-4V, yields interesting +observations: 1) Current VLLMs struggle with OOD texts but not images, unless +the visual information is limited; and 2) These VLLMs can be easily misled by +deceiving vision encoders only, and their vision-language training often +compromise safety protocols. We release this safety evaluation suite at +https://github.com/UCSC-VLAA/vllm-safety-benchmark. + +
+
+ comment: H.T., C.C., and Z.W. contribute equally. Work done during H.T. and + Z.W.'s internship at UCSC, and C.C. and Y.Z.'s internship at UNC +
+
+
+
+
+ + ☆ DUnE: Dataset for Unified Editing EMNLP 2023 + + +
+ Even the most advanced language models remain susceptible to errors +necessitating to modify these models without initiating a comprehensive +retraining process. Model editing refers to the modification of a model's +knowledge or representations in a manner that produces the desired outcomes. +Prior research primarily centered around editing factual data e.g. "Messi plays +for Inter Miami" confining the definition of an edit to a knowledge triplet +i.e. (subject, object, relation). However, as the applications of language +models expand, so do the diverse ways in which we wish to edit and refine their +outputs. In this study, we broaden the scope of the editing problem to include +an array of editing cases such as debiasing and rectifying reasoning errors and +define an edit as any natural language expression that solicits a change in the +model's outputs. We are introducing DUnE-an editing benchmark where edits are +natural language sentences and propose that DUnE presents a challenging yet +relevant task. To substantiate this claim, we conduct an extensive series of +experiments testing various editing approaches to address DUnE, demonstrating +their respective strengths and weaknesses. We show that retrieval-augmented +language modeling can outperform specialized editing techniques and neither set +of approaches has fully solved the generalized editing problem covered by our +benchmark. + +
+
+ comment: Accepted at EMNLP 2023 +
+
+
+
+
+ + ☆ BERT Goes Off-Topic: Investigating the Domain Transfer Challenge using + Genre Classification EMNLP'2023 + + +
+ While performance of many text classification tasks has been recently +improved due to Pre-trained Language Models (PLMs), in this paper we show that +they still suffer from a performance gap when the underlying distribution of +topics changes. For example, a genre classifier trained on \textit{political} +topics often fails when tested on documents about \textit{sport} or +\textit{medicine}. In this work, we quantify this phenomenon empirically with a +large corpus and a large set of topics. Consequently, we verify that domain +transfer remains challenging both for classic PLMs, such as BERT, and for +modern large models, such as GPT-3. We also suggest and successfully test a +possible remedy: after augmenting the training dataset with +topically-controlled synthetic texts, the F1 score improves by up to 50\% for +some topics, nearing on-topic training results, while others show little to no +improvement. While our empirical results focus on genre classification, our +methodology is applicable to other classification tasks such as gender, +authorship, or sentiment classification. The code and data to replicate the +experiments are available at https://github.com/dminus1/genre + +
+
+ comment: Published at EMNLP'2023 +
+
+
+
+
+ + ☆ MEDITRON-70B: Scaling Medical Pretraining for Large Language Models + + +
+ Large language models (LLMs) can potentially democratize access to medical +knowledge. While many efforts have been made to harness and improve LLMs' +medical knowledge and reasoning capacities, the resulting models are either +closed-source (e.g., PaLM, GPT-4) or limited in scale (<= 13B parameters), +which restricts their abilities. In this work, we improve access to large-scale +medical LLMs by releasing MEDITRON: a suite of open-source LLMs with 7B and 70B +parameters adapted to the medical domain. MEDITRON builds on Llama-2 (through +our adaptation of Nvidia's Megatron-LM distributed trainer), and extends +pretraining on a comprehensively curated medical corpus, including selected +PubMed articles, abstracts, and internationally-recognized medical guidelines. +Evaluations using four major medical benchmarks show significant performance +gains over several state-of-the-art baselines before and after task-specific +finetuning. Overall, MEDITRON achieves a 6% absolute performance gain over the +best public baseline in its parameter class and 3% over the strongest baseline +we finetuned from Llama-2. Compared to closed-source LLMs, MEDITRON-70B +outperforms GPT-3.5 and Med-PaLM and is within 5% of GPT-4 and 10% of +Med-PaLM-2. We release our code for curating the medical pretraining corpus and +the MEDITRON model weights to drive open-source development of more capable +medical LLMs. + +
+
+
+
+
+ + ☆ BioLORD-2023: Semantic Textual Representations Fusing LLM and Clinical + Knowledge Graph Insights + + +
+ In this study, we investigate the potential of Large Language Models to +complement biomedical knowledge graphs in the training of semantic models for +the biomedical and clinical domains. Drawing on the wealth of the UMLS +knowledge graph and harnessing cutting-edge Large Language Models, we propose a +new state-of-the-art approach for obtaining high-fidelity representations of +biomedical concepts and sentences, consisting of three steps: an improved +contrastive learning phase, a novel self-distillation phase, and a weight +averaging phase. Through rigorous evaluations via the extensive BioLORD testing +suite and diverse downstream tasks, we demonstrate consistent and substantial +performance improvements over the previous state of the art (e.g. +2pts on +MedSTS, +2.5pts on MedNLI-S, +6.1pts on EHR-Rel-B). Besides our new +state-of-the-art biomedical model for English, we also distill and release a +multilingual model compatible with 50+ languages and finetuned on 7 European +languages. Many clinical pipelines can benefit from our latest models. Our new +multilingual model enables a range of languages to benefit from our +advancements in biomedical semantic representation learning, opening a new +avenue for bioinformatics researchers around the world. As a result, we hope to +see BioLORD-2023 becoming a precious tool for future biomedical applications. + +
+
+ comment: Preprint of upcoming journal article +
+
+
+
+
+ + ☆ Sparsify-then-Classify: From Internal Neurons of Large Language Models + To Efficient Text Classifiers + + +
+ Among the many tasks that Large Language Models (LLMs) have revolutionized is +text classification. However, existing approaches for applying pretrained LLMs +to text classification predominantly rely on using single token outputs from +only the last layer of hidden states. As a result, they suffer from limitations +in efficiency, task-specificity, and interpretability. In our work, we +contribute an approach that uses all internal representations by employing +multiple pooling strategies on all activation and hidden states. Our novel +lightweight strategy, Sparsify-then-Classify (STC) first sparsifies +task-specific features layer-by-layer, then aggregates across layers for text +classification. STC can be applied as a seamless plug-and-play module on top of +existing LLMs. Our experiments on a comprehensive set of models and datasets +demonstrate that STC not only consistently improves the classification +performance of pretrained and fine-tuned models, but is also more efficient for +both training and inference, and is more intrinsically interpretable. + +
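+ To make the layer-wise recipe above concrete, here is a rough sketch of
+pooling, per-layer sparsification, and cross-layer aggregation feeding a linear
+classifier. It assumes binary labels and pre-computed hidden states, and it is
+a generic illustration rather than the authors' STC implementation:
+
+    # Hedged sketch: sparsify per-layer pooled features, then aggregate across
+    # layers into a single linear classifier (illustrative choices throughout).
+    import numpy as np
+    from sklearn.linear_model import LogisticRegression
+
+    def pool_layer(h):
+        # h: (seq_len, dim) hidden states for one example and one layer;
+        # stacking these over examples yields the per-layer feature matrices.
+        return np.concatenate([h.mean(axis=0), h.max(axis=0)])
+
+    def sparsify(layer_feats, labels, top_k=64):
+        # layer_feats: (n_examples, feat_dim); keep the dimensions with the
+        # largest class-mean separation as a simple task-specific selector.
+        labels = np.asarray(labels)
+        gap = np.abs(layer_feats[labels == 1].mean(0) -
+                     layer_feats[labels == 0].mean(0))
+        return np.argsort(gap)[-top_k:]
+
+    def fit_sparsify_then_classify(per_layer_feats, labels, top_k=64):
+        # per_layer_feats: list of (n_examples, feat_dim) arrays, one per layer.
+        kept = [sparsify(f, labels, top_k) for f in per_layer_feats]
+        X = np.concatenate([f[:, idx] for f, idx in zip(per_layer_feats, kept)],
+                           axis=1)
+        clf = LogisticRegression(max_iter=1000).fit(X, labels)
+        return clf, kept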
+
+ comment: 23 pages, 5 figures, 8 tables Code available at + https://github.com/difanj0713/Sparsify-then-Classify +
+
+
+
+
+ + ☆ Efficient Pre-training for Localized Instruction Generation of Videos + + +
+ Procedural videos show step-by-step demonstrations of tasks like recipe +preparation. Understanding such videos is challenging, involving the precise +localization of steps and the generation of textual instructions. Manually +annotating steps and writing instructions is costly, which limits the size of +current datasets and hinders effective learning. Leveraging large but noisy +video-transcript datasets for pre-training can boost performance, but demands +significant computational resources. Furthermore, transcripts contain +irrelevant content and exhibit style variation compared to instructions written +by human annotators. To mitigate both issues, we propose a technique, +Sieve-&-Swap, to automatically curate a smaller dataset: (i) Sieve filters +irrelevant transcripts and (ii) Swap enhances the quality of the text +instruction by automatically replacing the transcripts with human-written +instructions from a text-only recipe dataset. The curated dataset, three orders +of magnitude smaller than current web-scale datasets, enables efficient +training of large-scale models with competitive performance. We complement our +Sieve-\&-Swap approach with a Procedure Transformer (ProcX) for end-to-end step +localization and instruction generation for procedural videos. When this model +is pre-trained on our curated dataset, it achieves state-of-the-art performance +in zero-shot and finetuning settings on YouCook2 and Tasty, while using a +fraction of the computational resources. + +
+
+
+
+
+ + ☆ A Quantitative Approach to Understand Self-Supervised Models as + Cross-lingual Feature Extractors + + +
+ In this work, we study the features extracted by English self-supervised
+learning (SSL) models in cross-lingual contexts and propose a new metric to
+predict the quality of feature representations. Using automatic speech
+recognition (ASR) as a downstream task, we analyze the effect of model size,
+training objectives, and model architecture on the models' performance as a
+feature extractor for a set of typologically diverse corpora. We develop a
+novel metric, the Phonetic-Syntax Ratio (PSR), to measure the phonetic and
+syntactic information in the extracted representations using deep generalized
+canonical correlation analysis. Results show that the contrastive loss in the
+wav2vec2.0 objective facilitates more effective cross-lingual feature
+extraction. There is a positive correlation between PSR scores and ASR
+performance, suggesting that phonetic information extracted by monolingual SSL
+models can be used for downstream tasks in cross-lingual settings. The proposed
+metric is an effective indicator of the quality of the representations and can
+be useful for model selection.
+
+
+
+ comment: 12 pages, 5 figures, 4 tables +
+
+
+
+
+ + ☆ Leveraging deep active learning to identify low-resource mobility + functioning information in public clinical notes + + +
+ Function is increasingly recognized as an important indicator of whole-person +health, although it receives little attention in clinical natural language +processing research. We introduce the first public annotated dataset +specifically on the Mobility domain of the International Classification of +Functioning, Disability and Health (ICF), aiming to facilitate automatic +extraction and analysis of functioning information from free-text clinical +notes. We utilize the National NLP Clinical Challenges (n2c2) research dataset +to construct a pool of candidate sentences using keyword expansion. Our active +learning approach, using query-by-committee sampling weighted by density +representativeness, selects informative sentences for human annotation. We +train BERT and CRF models, and use predictions from these models to guide the +selection of new sentences for subsequent annotation iterations. Our final +dataset consists of 4,265 sentences with a total of 11,784 entities, including +5,511 Action entities, 5,328 Mobility entities, 306 Assistance entities, and +639 Quantification entities. The inter-annotator agreement (IAA), averaged over +all entity types, is 0.72 for exact matching and 0.91 for partial matching. We +also train and evaluate common BERT models and state-of-the-art Nested NER +models. The best F1 scores are 0.84 for Action, 0.7 for Mobility, 0.62 for +Assistance, and 0.71 for Quantification. Empirical results demonstrate +promising potential of NER models to accurately extract mobility functioning +information from clinical text. The public availability of our annotated +dataset will facilitate further research to comprehensively capture functioning +information in electronic health records (EHRs). + +
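+ The sampling strategy described above (query-by-committee weighted by density
+representativeness) can be sketched generically as follows; the committee, the
+embeddings, and the weighting below are illustrative assumptions rather than
+the authors' exact configuration:
+
+    # Hedged sketch: pick unlabeled sentences where committee members disagree
+    # most (vote entropy), weighted by how representative each sentence is of
+    # the pool (mean cosine similarity).
+    import numpy as np
+
+    def vote_entropy(committee_preds):
+        # committee_preds: (n_models, n_samples) array of predicted labels.
+        n_models, n_samples = committee_preds.shape
+        ent = np.zeros(n_samples)
+        for j in range(n_samples):
+            _, counts = np.unique(committee_preds[:, j], return_counts=True)
+            p = counts / n_models
+            ent[j] = -(p * np.log(p + 1e-12)).sum()
+        return ent
+
+    def density(embeddings):
+        # Mean cosine similarity of each point to the rest of the pool.
+        x = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
+        sims = x @ x.T
+        return (sims.sum(axis=1) - 1.0) / (len(x) - 1)
+
+    def select_batch(committee_preds, embeddings, k=50, beta=1.0):
+        scores = vote_entropy(committee_preds) * density(embeddings) ** beta
+        return np.argsort(scores)[-k:]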
+
+
+
+
+ + ☆ Tell2Design: A Dataset for Language-Guided Floor Plan Generation ACL2023 + + +
+ We consider the task of generating designs directly from natural language +descriptions, and consider floor plan generation as the initial research area. +Language conditional generative models have recently been very successful in +generating high-quality artistic images. However, designs must satisfy +different constraints that are not present in generating artistic images, +particularly spatial and relational constraints. We make multiple contributions +to initiate research on this task. First, we introduce a novel dataset, +\textit{Tell2Design} (T2D), which contains more than $80k$ floor plan designs +associated with natural language instructions. Second, we propose a +Sequence-to-Sequence model that can serve as a strong baseline for future +research. Third, we benchmark this task with several text-conditional image +generation models. We conclude by conducting human evaluations on the generated +samples and providing an analysis of human performance. We hope our +contributions will propel the research on language-guided design generation +forward. + +
+
+ comment: Paper published in ACL2023; Area Chair Award; Best Paper Nomination +
+
+
+
+
+ + ☆ WorldSense: A Synthetic Benchmark for Grounded Reasoning in Large + Language Models + + +
+ We propose WorldSense, a benchmark designed to assess the extent to which
+LLMs are consistently able to sustain tacit world models, by testing how they
+draw simple inferences from descriptions of simple arrangements of entities.
+WorldSense is a synthetic benchmark with three problem types, each with their
+own trivial control, which explicitly avoids bias by decorrelating the abstract
+structure of problems from the vocabulary and expressions, and by decorrelating
+all problem subparts from the correct response. We run our benchmark on three
+state-of-the-art chat-LLMs (GPT3.5, GPT4 and Llama2-chat) and show that these
+models make errors even with as few as three objects. Furthermore, they have
+quite heavy response biases, preferring certain responses irrespective of the
+question. Errors persist even with chain-of-thought prompting and in-context
+learning. Lastly, we show that while finetuning on similar problems does result
+in substantial improvements -- within- and out-of-distribution -- the finetuned
+models do not generalise beyond a constrained problem space.
+
+
+
+
+
+
+ + ☆ Data Generation for Post-OCR correction of Cyrillic handwriting + + +
+ This paper introduces a novel approach to post-Optical Character Recognition
+Correction (POC) for handwritten Cyrillic text, addressing a significant gap in
+current research methodologies. This gap is due to the lack of large text
+corpora that provide OCR errors for further training of language-based POC
+models, which are demanding in terms of corpus size. Our study primarily
+focuses on the development and application of a synthetic handwriting
+generation engine based on B\'ezier curves. Such an engine generates highly
+realistic handwritten text in any amount, which we utilize to create a
+substantial dataset by transforming Russian text corpora sourced from the
+internet. We apply a Handwritten Text Recognition (HTR) model to this dataset
+to identify OCR errors, forming the basis for our POC model training. The
+correction model is trained on a 90-symbol input context, utilizing a
+pre-trained T5 architecture with a seq2seq correction task. We evaluate our
+approach on the HWR200 and School_notebooks_RU datasets as they provide
+significant challenges in the HTR domain. Furthermore, POC can be used to
+highlight errors for teachers when evaluating student performance. This can be
+done simply by comparing sentences before and after correction and displaying
+the differences in the text. Our primary contribution lies in the innovative
+use of B\'ezier curves for Cyrillic text generation and subsequent error
+correction using a specialized POC model. We validate our approach by
+presenting Word Accuracy Rate (WAR) and Character Accuracy Rate (CAR) results,
+both with and without post-OCR correction, using real open corpora of
+handwritten Cyrillic text. These results, coupled with our methodology, are
+designed to be reproducible, paving the way for further advancements in the
+field of OCR and handwritten text analysis. The paper's contributions are
+available at https://github.com/dbrainio/CyrillicHandwritingPOC
+
+
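+ For readers unfamiliar with the underlying primitive: the synthetic
+handwriting engine is described as being built on B\'ezier curves, which are
+simple polynomial curves defined by a few control points. A minimal sketch of
+sampling a cubic B\'ezier "stroke" is below; the control points are arbitrary,
+since the real engine's stroke parameters are not given in the abstract:
+
+    # Hedged sketch: evaluate a cubic Bezier curve at n points.
+    import numpy as np
+
+    def cubic_bezier(p0, p1, p2, p3, n=100):
+        t = np.linspace(0.0, 1.0, n)[:, None]
+        return ((1 - t) ** 3 * p0 + 3 * (1 - t) ** 2 * t * p1
+                + 3 * (1 - t) * t ** 2 * p2 + t ** 3 * p3)
+
+    # Example "pen stroke" from four 2-D control points.
+    stroke = cubic_bezier(np.array([0.0, 0.0]), np.array([0.3, 1.0]),
+                          np.array([0.7, -1.0]), np.array([1.0, 0.0]))
+    print(stroke.shape)  # (100, 2)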
+
+ comment: 17 pages, 27 figures, 6 tables, 26 references +
+
+
+
+
+ + ☆ YUAN 2.0: A Large Language Model with Localized Filtering-based + Attention + + +
+ In this work, the Localized Filtering-based Attention (LFA) is introduced to
+incorporate prior knowledge of local dependencies of natural language into
+Attention. Based on LFA, we develop and release Yuan 2.0, a large language
+model with parameters ranging from 2.1 billion to 102.6 billion. A data
+filtering and generation method is presented to build high-quality pretraining
+and fine-tuning datasets. A distributed training method with non-uniform
+pipeline parallelism, data parallelism, and optimizer parallelism is proposed,
+which greatly reduces the bandwidth requirements of intra-node communication
+and achieves good performance in large-scale distributed training. Yuan 2.0
+models display impressive ability in code generation, math problem-solving,
+and chat compared with existing models. The latest version of YUAN 2.0,
+including model weights and source code, is accessible at Github.
+
+
+
+
+
+
+ + ☆ Increasing Coverage and Precision of Textual Information in Multilingual + Knowledge Graphs EMNLP 2023 + + +
+ Recent work in Natural Language Processing and Computer Vision has been using +textual information -- e.g., entity names and descriptions -- available in +knowledge graphs to ground neural models to high-quality structured data. +However, when it comes to non-English languages, the quantity and quality of +textual information are comparatively scarce. To address this issue, we +introduce the novel task of automatic Knowledge Graph Enhancement (KGE) and +perform a thorough investigation on bridging the gap in both the quantity and +quality of textual information between English and non-English languages. More +specifically, we: i) bring to light the problem of increasing multilingual +coverage and precision of entity names and descriptions in Wikidata; ii) +demonstrate that state-of-the-art methods, namely, Machine Translation (MT), +Web Search (WS), and Large Language Models (LLMs), struggle with this task; +iii) present M-NTA, a novel unsupervised approach that combines MT, WS, and +LLMs to generate high-quality textual information; and, iv) study the impact of +increasing multilingual coverage and precision of non-English textual +information in Entity Linking, Knowledge Graph Completion, and Question +Answering. As part of our effort towards better multilingual knowledge graphs, +we also introduce WikiKGE-10, the first human-curated benchmark to evaluate KGE +approaches in 10 languages across 7 language families. + +
+
+ comment: Camera ready for EMNLP 2023 +
+
+
+
+
+ + ☆ Knowledge Unlearning for LLMs: Tasks, Methods, and Challenges + + +
+ In recent years, large language models (LLMs) have spurred a new research
+paradigm in natural language processing. Despite their excellent capability in
+knowledge-based question answering and reasoning, their potential to retain
+faulty or even harmful knowledge poses risks of malicious application. The
+challenge of mitigating this issue and transforming these models into purer
+assistants is crucial for their widespread applicability. Unfortunately,
+retraining LLMs repeatedly to eliminate undesirable knowledge is impractical
+due to their immense parameter counts. Knowledge unlearning, derived from
+analogous studies on machine unlearning, presents a promising avenue to address
+this concern and is notably advantageous in the context of LLMs. It allows for
+the removal of harmful knowledge in an efficient manner, without affecting
+unrelated knowledge in the model. To this end, we provide a survey of knowledge
+unlearning in the era of LLMs. Firstly, we formally define the knowledge
+unlearning problem and distinguish it from related works. Subsequently, we
+categorize existing knowledge unlearning methods into three classes: those
+based on parameter optimization, parameter merging, and in-context learning,
+and introduce details of these unlearning methods. We further present
+evaluation datasets used in existing methods, and finally conclude this survey
+by presenting the ongoing challenges and future directions.
+
+
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Towards Vision Enhancing LLMs: Empowering Multimodal Knowledge Storage + and Sharing in LLMs + + +
+ Recent advancements in multimodal large language models (MLLMs) have achieved
+significant multimodal generation capabilities, akin to GPT-4. These models
+predominantly map visual information into language representation space,
+leveraging the vast knowledge and powerful text generation abilities of LLMs to
+produce multimodal instruction-following responses. This approach can be termed
+LLMs for Vision, since it employs LLMs for visual-language understanding; yet
+we observe that these MLLMs neglect the potential of harnessing visual
+knowledge to enhance the overall capabilities of LLMs, which could be regarded
+as Vision Enhancing LLMs. In this paper, we propose an approach called MKS2,
+aimed at enhancing LLMs through empowering Multimodal Knowledge Storage and
+Sharing in LLMs. Specifically, we introduce the Modular Visual Memory, a
+component integrated into the internal blocks of LLMs, designed to store
+open-world visual information efficiently. Additionally, we present a soft
+Mixtures-of-Multimodal Experts architecture in LLMs to invoke multimodal
+knowledge collaboration during generation. Our comprehensive experiments
+demonstrate that MKS2 substantially augments the reasoning capabilities of LLMs
+in contexts necessitating physical or commonsense knowledge. It also delivers
+competitive results on multimodal benchmarks.
+
+
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ Italian Crossword Generator: Enhancing Education through Interactive + Word Puzzles + + +
+ Educational crosswords offer numerous benefits for students, including +increased engagement, improved understanding, critical thinking, and memory +retention. Creating high-quality educational crosswords can be challenging, but +recent advances in natural language processing and machine learning have made +it possible to use language models to generate nice wordplays. The exploitation +of cutting-edge language models like GPT3-DaVinci, GPT3-Curie, GPT3-Babbage, +GPT3-Ada, and BERT-uncased has led to the development of a comprehensive system +for generating and verifying crossword clues. A large dataset of clue-answer +pairs was compiled to fine-tune the models in a supervised manner to generate +original and challenging clues from a given keyword. On the other hand, for +generating crossword clues from a given text, Zero/Few-shot learning techniques +were used to extract clues from the input text, adding variety and creativity +to the puzzles. We employed the fine-tuned model to generate data and labeled +the acceptability of clue-answer parts with human supervision. To ensure +quality, we developed a classifier by fine-tuning existing language models on +the labeled dataset. Conversely, to assess the quality of clues generated from +the given text using zero/few-shot learning, we employed a zero-shot learning +approach to check the quality of generated clues. The results of the evaluation +have been very promising, demonstrating the effectiveness of the approach in +creating high-standard educational crosswords that offer students engaging and +rewarding learning experiences. + +
+
+ comment: Accepted Paper for CLiC-it 2023 - 9th Italian Conference on + Computational Linguistics +
+
+
+
+
+ + ☆ Justifiable Artificial Intelligence: Engineering Large Language Models + for Legal Applications + + +
+ In this work, I discuss how Large Language Models can be applied in the legal
+domain, circumventing their current drawbacks. Despite their large success and
+acceptance, their lack of explainability prevents legal experts from trusting
+their output, and rightfully so. However, in this paper, I argue in favor of a
+new view, Justifiable Artificial Intelligence, instead of focusing on
+Explainable Artificial Intelligence. I discuss how gaining evidence for and
+against a Large Language Model's output may make its generated texts more
+trustworthy - or hold it accountable for misinformation.
+
+
+
+ comment: 12 pages, 2 figures +
+
+
+
+
+ + ☆ Cerbero-7B: A Leap Forward in Language-Specific LLMs Through Enhanced + Chat Corpus Generation and Evaluation + + +
+ This study introduces a novel approach for generating high-quality, +language-specific chat corpora using a self-chat mechanism. We combine a +generator LLM for creating new samples and an embedder LLM to ensure diversity. +A new Masked Language Modelling (MLM) model-based quality assessment metric is +proposed for evaluating and filtering the corpora. Utilizing the llama2-70b as +the generator and a multilingual sentence transformer as embedder, we generate +an Italian chat corpus and refine the Fauno corpus, which is based on +translated English ChatGPT self-chat data. The refinement uses structural +assertions and Natural Language Processing techniques. Both corpora undergo a +comprehensive quality evaluation using the proposed MLM model-based quality +metric. The Italian LLM fine-tuned with these corpora demonstrates +significantly enhanced language comprehension and question-answering skills. +The resultant model, cerbero-7b, establishes a new state-of-the-art for Italian +LLMs. This approach marks a substantial advancement in the development of +language-specific LLMs, with a special emphasis on augmenting corpora for +underrepresented languages like Italian. + +
+
+
+
+
+ + ☆ MoDS: Model-oriented Data Selection for Instruction Tuning + + +
+ Instruction tuning has become the de facto method to equip large language
+models (LLMs) with the ability of following user instructions. Usually,
+hundreds of thousands or millions of instruction-following pairs are employed
+to fine-tune the foundation LLMs. Recently, some studies show that a small
+number of high-quality instruction data is enough. However, how to select
+appropriate instruction data for a given LLM is still an open problem. To
+address this problem, in this paper we present a model-oriented data selection
+(MoDS) approach, which selects instruction data based on new criteria
+considering three aspects: quality, coverage and necessity. First, our approach
+utilizes a quality evaluation model to filter out the high-quality subset from
+the original instruction dataset, and then designs an algorithm to further
+select from the high-quality subset a seed instruction dataset with good
+coverage. The seed dataset is applied to fine-tune the foundation LLM to obtain
+an initial instruction-following LLM. Finally, we develop a necessity
+evaluation model to find the instruction data on which the initial
+instruction-following LLM performs badly and treat them as necessary
+instructions for further improving the LLM. In this way, we can get a small
+high-quality, broad-coverage and high-necessity subset from the original
+instruction datasets. Experimental results show that the model fine-tuned with
+4,000 instruction pairs selected by our approach can perform better than the
+model fine-tuned with the full original dataset, which includes 214k
+instruction pairs.
+
+
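+ The three-stage selection described above can be sketched as a small
+pipeline. The quality and necessity scores are assumed to come from separately
+trained evaluation models, and greedy k-center is one plausible choice for the
+coverage step rather than the authors' exact algorithm:
+
+    # Hedged sketch: quality filtering -> coverage-driven seed selection ->
+    # necessity-based augmentation (all thresholds illustrative).
+    import numpy as np
+
+    def quality_filter(examples, quality_scores, threshold=0.8):
+        return [ex for ex, s in zip(examples, quality_scores) if s >= threshold]
+
+    def k_center_seed(embeddings, k):
+        # Greedy k-center: iteratively add the point farthest from the chosen
+        # set, so the seed covers the embedding space broadly.
+        chosen = [0]
+        dists = np.linalg.norm(embeddings - embeddings[0], axis=1)
+        for _ in range(k - 1):
+            nxt = int(dists.argmax())
+            chosen.append(nxt)
+            dists = np.minimum(
+                dists, np.linalg.norm(embeddings - embeddings[nxt], axis=1))
+        return chosen
+
+    def necessity_augment(pool, necessity_scores, budget):
+        # Add the instructions the initial fine-tuned model handles worst.
+        order = np.argsort(necessity_scores)[::-1]
+        return [pool[i] for i in order[:budget]]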
+
+
+
+
+ + ☆ Reinforcement Learning from Diffusion Feedback: Q* for Image Search + + +
+ Large vision-language models are steadily gaining personalization +capabilities at the cost of fine-tuning or data augmentation. We present two +models for image generation using model-agnostic learning that align semantic +priors with generative capabilities. RLDF, or Reinforcement Learning from +Diffusion Feedback, is a singular approach for visual imitation through +prior-preserving reward function guidance. This employs Q-learning (with +standard Q*) for generation and follows a semantic-rewarded trajectory for +image search through finite encoding-tailored actions. The second proposed +method, noisy diffusion gradient, is optimization driven. At the root of both +methods is a special CFG encoding that we propose for continual semantic +guidance. Using only a single input image and no text input, RLDF generates +high-quality images over varied domains including retail, sports and +agriculture showcasing class-consistency and strong visual diversity. Project +website is available at https://infernolia.github.io/RLDF. + +
+
+
+
+
+ + ☆ InfoPattern: Unveiling Information Propagation Patterns in Social Media + + +
+ Social media play a significant role in shaping public opinion and +influencing ideological communities through information propagation. Our demo +InfoPattern centers on the interplay between language and human ideology. The +demo (Code: https://github.com/blender-nlp/InfoPattern ) is capable of: (1) red +teaming to simulate adversary responses from opposite ideology communities; (2) +stance detection to identify the underlying political sentiments in each +message; (3) information propagation graph discovery to reveal the evolution of +claims across various communities over time. (Live Demo: +https://incas.csl.illinois.edu/blender/About ) + +
+
+
+
+
+ + ☆ The WebCrow French Crossword Solver + + +
+ Crossword puzzles are one of the most popular word games, played in different +languages all across the world, where riddle style can vary significantly from +one country to another. Automated crossword resolution is challenging, and +typical solvers rely on large databases of previously solved crosswords. In +this work, we extend WebCrow 2.0, an automatic crossword solver, to French, +making it the first program for crossword solving in the French language. To +cope with the lack of a large repository of clue-answer crossword data, WebCrow +2.0 exploits multiple modules, called experts, that retrieve candidate answers +from heterogeneous resources, such as the web, knowledge graphs, and linguistic +rules. We compared WebCrow's performance against humans in two different +challenges. Despite the limited amount of past crosswords, French WebCrow was +competitive, actually outperforming humans in terms of speed and accuracy, thus +proving its capabilities to generalize to new languages. + +
+
+ comment: Accepted Paper for EAI Intetain 2023 - 14th EAI International + Conference on Intelligent Technologies for Interactive Entertainment +
+
+
+
+
+ + ☆ Injecting linguistic knowledge into BERT for Dialogue State Tracking + + +
+ Dialogue State Tracking (DST) models often employ intricate neural network +architectures, necessitating substantial training data, and their inference +processes lack transparency. This paper proposes a method that extracts +linguistic knowledge via an unsupervised framework and subsequently utilizes +this knowledge to augment BERT's performance and interpretability in DST tasks. +The knowledge extraction procedure is computationally economical and does not +necessitate annotations or additional training data. The injection of the +extracted knowledge necessitates the addition of only simple neural modules. We +employ the Convex Polytopic Model (CPM) as a feature extraction tool for DST +tasks and illustrate that the acquired features correlate with the syntactic +and semantic patterns in the dialogues. This correlation facilitates a +comprehensive understanding of the linguistic features influencing the DST +model's decision-making process. We benchmark this framework on various DST +tasks and observe a notable improvement in accuracy. + +
+
+
+
+
+ + ☆ FreeAL: Towards Human-Free Active Learning in the Era of Large Language + Models EMNLP 2023 + + +
+ Collecting high-quality labeled data for model training is notoriously
+time-consuming and labor-intensive for various NLP tasks. While numerous
+solutions, such as active learning for small language models (SLMs) and
+prevalent in-context learning in the era of large language models (LLMs), have
+been proposed and alleviate the labeling burden to some extent, their
+performance is still subject to human intervention. How to reduce the
+annotation cost in the LLM era remains underexplored. To bridge this gap, we
+revolutionize traditional active learning and propose an innovative
+collaborative learning framework, FreeAL, to interactively distill and filter
+task-specific knowledge from LLMs. During collaborative training, an LLM serves
+as an active annotator that imparts its coarse-grained knowledge, while a
+downstream SLM acts as a student that distills high-quality in-context samples
+and feeds them back to the LLM for subsequent label refinement. Extensive
+experiments on eight benchmark datasets demonstrate that FreeAL largely
+enhances zero-shot performance for both the SLM and the LLM without any human
+supervision. The code is available at https://github.com/Justherozen/FreeAL .
+
+
+
+ comment: Accepted to EMNLP 2023 (Main conference) +
+
+
+
+
+ + ☆ Can Vision-Language Models Think from a First-Person Perspective? + + +
+ Vision-language models (VLMs) have recently shown promising results in +traditional downstream tasks. Evaluation studies have emerged to assess their +abilities, with the majority focusing on the third-person perspective, and only +a few addressing specific tasks from the first-person perspective. However, the +capability of VLMs to "think" from a first-person perspective, a crucial +attribute for advancing autonomous agents and robotics, remains largely +unexplored. To bridge this research gap, we introduce EgoThink, a novel visual +question-answering benchmark that encompasses six core capabilities with twelve +detailed dimensions. The benchmark is constructed using selected clips from +egocentric videos, with manually annotated question-answer pairs containing +first-person information. To comprehensively assess VLMs, we evaluate eighteen +popular VLMs on EgoThink. Moreover, given the open-ended format of the answers, +we use GPT-4 as the automatic judge to compute single-answer grading. +Experimental results indicate that although GPT-4V leads in numerous +dimensions, all evaluated VLMs still possess considerable potential for +improvement in first-person perspective tasks. Meanwhile, enlarging the number +of trainable parameters has the most significant impact on model performance on +EgoThink. In conclusion, EgoThink serves as a valuable addition to existing +evaluation benchmarks for VLMs, providing an indispensable resource for future +research in the realm of embodied artificial intelligence and robotics. + +
+
+
+
+
+ + ☆ SpotServe: Serving Generative Large Language Models on Preemptible + Instances ASPLOS 2024 + + +
+ The high computational and memory requirements of generative large language +models (LLMs) make it challenging to serve them cheaply. This paper aims to +reduce the monetary cost for serving LLMs by leveraging preemptible GPU +instances on modern clouds, which offer accesses to spare GPUs at a much +cheaper price than regular instances but may be preempted by the cloud at any +time. Serving LLMs on preemptible instances requires addressing challenges +induced by frequent instance preemptions and the necessity of migrating +instances to handle these preemptions. + This paper presents SpotServe, the first distributed LLM serving system on +preemptible instances. Several key techniques in SpotServe realize fast and +reliable serving of generative LLMs on cheap preemptible instances. First, +SpotServe dynamically adapts the LLM parallelization configuration for dynamic +instance availability and fluctuating workload, while balancing the trade-off +among the overall throughput, inference latency and monetary costs. Second, to +minimize the cost of migrating instances for dynamic reparallelization, the +task of migrating instances is formulated as a bipartite graph matching +problem, which uses the Kuhn-Munkres algorithm to identify an optimal migration +plan that minimizes communications. Finally, to take advantage of the grace +period offered by modern clouds, we introduce stateful inference recovery, a +new inference mechanism that commits inference progress at a much finer +granularity and allows SpotServe to cheaply resume inference upon preemption. +We evaluate on real spot instance preemption traces and various popular LLMs +and show that SpotServe can reduce the P99 tail latency by 2.4 - 9.1x compared +with the best existing LLM serving systems. We also show that SpotServe can +leverage the price advantage of preemptive instances, saving 54% monetary cost +compared with only using on-demand instances. + +
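+ The migration step above is a textbook assignment problem; a minimal sketch
+of solving it with the Kuhn-Munkres (Hungarian) algorithm via SciPy is shown
+below. The cost matrix is a made-up example; SpotServe's real costs model the
+communication needed to move model and cache state between instances:
+
+    # Hedged sketch: an optimal instance-migration plan as bipartite matching.
+    import numpy as np
+    from scipy.optimize import linear_sum_assignment
+
+    # cost[i, j]: hypothetical communication cost of mapping old shard i onto
+    # surviving/new instance j.
+    cost = np.array([[4.0, 1.0, 3.0],
+                     [2.0, 0.0, 5.0],
+                     [3.0, 2.0, 2.0]])
+
+    rows, cols = linear_sum_assignment(cost)   # Kuhn-Munkres assignment
+    plan = list(zip(rows.tolist(), cols.tolist()))
+    print(plan, "total cost:", cost[rows, cols].sum())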
+
+ comment: ASPLOS 2024 +
+
+
+
+
+ + ☆ Evaluating the Efficacy of Hybrid Deep Learning Models in Distinguishing + AI-Generated Text + + +
+ My research investigates the use of cutting-edge hybrid deep learning models
+to accurately differentiate between AI-generated text and human writing. I
+applied a robust methodology, utilising a carefully selected dataset comprising
+AI and human texts from various sources, each tagged with instructions.
+Advanced natural language processing techniques facilitated the analysis of
+textual features. By combining sophisticated neural networks, the custom model
+was able to detect nuanced differences between AI-generated and human-written
+content.
+
+
+
+
+
+
+ + ☆ Boot and Switch: Alternating Distillation for Zero-Shot Dense Retrieval EMNLP 2023 + + +
+ Neural 'dense' retrieval models are state of the art for many datasets, +however these models often exhibit limited domain transfer ability. Existing +approaches to adaptation are unwieldy, such as requiring explicit supervision, +complex model architectures, or massive external models. We present +$\texttt{ABEL}$, a simple but effective unsupervised method to enhance passage +retrieval in zero-shot settings. Our technique follows a straightforward loop: +a dense retriever learns from supervision signals provided by a reranker, and +subsequently, the reranker is updated based on feedback from the improved +retriever. By iterating this loop, the two components mutually enhance one +another's performance. Experimental results demonstrate that our unsupervised +$\texttt{ABEL}$ model outperforms both leading supervised and unsupervised +retrievers on the BEIR benchmark. Meanwhile, it exhibits strong adaptation +abilities to tasks and domains that were unseen during training. By either +fine-tuning $\texttt{ABEL}$ on labelled data or integrating it with existing +supervised dense retrievers, we achieve state-of-the-art +results.\footnote{Source code is available at +\url{https://github.com/Fantabulous-J/BootSwitch}.} + +
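+ The alternating loop described above can be written down generically. The
+retriever and reranker below are passed in as objects with assumed search,
+score, and train interfaces; they stand for whatever implementations one uses,
+not ABEL's released code:
+
+    # Hedged sketch: boot-and-switch style alternating distillation between a
+    # dense retriever and a reranker (interfaces are assumptions, not ABEL's API).
+    def alternate_distillation(queries, corpus, retriever, reranker,
+                               rounds=3, top_k=100):
+        for _ in range(rounds):
+            # 1) The reranker supervises the retriever on its own candidates.
+            candidates = {q: retriever.search(q, corpus, top_k) for q in queries}
+            soft_labels = {q: reranker.score(q, candidates[q]) for q in queries}
+            retriever = retriever.train(candidates, soft_labels)
+            # 2) The improved retriever refreshes the reranker's training pool.
+            harder = {q: retriever.search(q, corpus, top_k) for q in queries}
+            reranker = reranker.train(harder)
+        return retriever, reranker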
+
+ comment: Accepted by EMNLP 2023 Findings +
+
+
+
+
+ + ☆ Noisy Self-Training with Synthetic Queries for Dense Retrieval EMNLP 2023 + + +
+ Although existing neural retrieval models reveal promising results when +training data is abundant and the performance keeps improving as training data +increases, collecting high-quality annotated data is prohibitively costly. To +this end, we introduce a novel noisy self-training framework combined with +synthetic queries, showing that neural retrievers can be improved in a +self-evolution manner with no reliance on any external models. Experimental +results show that our method improves consistently over existing methods on +both general-domain (e.g., MS-MARCO) and out-of-domain (i.e., BEIR) retrieval +benchmarks. Extra analysis on low-resource settings reveals that our method is +data efficient and outperforms competitive baselines, with as little as 30% of +labelled training data. Further extending the framework for reranker training +demonstrates that the proposed method is general and yields additional gains on +tasks of diverse domains.\footnote{Source code is available at +\url{https://github.com/Fantabulous-J/Self-Training-DPR}} + +
+
+ comment: Accepted by EMNLP 2023 Findings +
+
+
+
+
+ + ☆ Deficiency of Large Language Models in Finance: An Empirical Examination + of Hallucination + + +
+ The hallucination issue is recognized as a fundamental deficiency of large
+language models (LLMs), especially when applied to fields such as finance,
+education, and law. Despite the growing concerns, there has been a lack of
+empirical investigation. In this paper, we provide an empirical examination of
+LLMs' hallucination behaviors in financial tasks. First, we empirically
+investigate LLMs' ability to explain financial concepts and terminology.
+Second, we assess LLMs' capacity to query historical stock prices. Third, to
+alleviate the hallucination issue, we evaluate the efficacy of four practical
+methods: few-shot learning, Decoding by Contrasting Layers (DoLa),
+Retrieval-Augmented Generation (RAG), and prompt-based tool learning that calls
+a function to generate a query command. Finally, our major finding is that
+off-the-shelf LLMs exhibit serious hallucination behaviors in financial tasks.
+Therefore, there is an urgent need for research efforts on mitigating LLMs'
+hallucination.
+
+
+
+
+
+
+ + ☆ The effect of source disclosure on evaluation of AI-generated messages: + A two-part study + + +
+ Advancements in artificial intelligence (AI) over the last decade demonstrate +that machines can exhibit communicative behavior and influence how humans +think, feel, and behave. In fact, the recent development of ChatGPT has shown +that large language models (LLMs) can be leveraged to generate high-quality +communication content at scale and across domains, suggesting that they will be +increasingly used in practice. However, many questions remain about how knowing +the source of the messages influences recipients' evaluation of and preference +for AI-generated messages compared to human-generated messages. This paper +investigated this topic in the context of vaping prevention messaging. In Study +1, which was pre-registered, we examined the influence of source disclosure on +people's evaluation of AI-generated health prevention messages compared to +human-generated messages. We found that source disclosure (i.e., labeling the +source of a message as AI vs. human) significantly impacted the evaluation of +the messages but did not significantly alter message rankings. In a follow-up +study (Study 2), we examined how the influence of source disclosure may vary by +the participants' negative attitudes towards AI. We found a significant +moderating effect of negative attitudes towards AI on message evaluation, but +not for message selection. However, for those with moderate levels of negative +attitudes towards AI, source disclosure decreased the preference for +AI-generated messages. Overall, the results of this series of studies showed a +slight bias against AI-generated messages once the source was disclosed, adding +to the emerging area of study that lies at the intersection of AI and +communication. + +
+
+ comment: Manuscript currently under review. Paper presented at 109th Annual + National Communication Association (NCA) Conference, November 16-19, 2023. 10 + pages, 5 figures +
+
+
+
+
+ + ☆ Overview of the VLSP 2022 -- Abmusu Shared Task: A Data Challenge for + Vietnamese Abstractive Multi-document Summarization SP 2022 + + +
+ This paper presents an overview of the VLSP 2022 - Vietnamese abstractive
+multi-document summarization (Abmusu) shared task for Vietnamese news. This
+task is hosted at the 9$^{th}$ annual workshop on Vietnamese Language and
+Speech Processing (VLSP 2022). The goal of the Abmusu shared task is to develop
+summarization systems that can automatically create abstractive summaries for
+a set of documents on a topic. The model input is multiple news documents on
+the same topic, and the corresponding output is a related abstractive summary.
+In the scope of the Abmusu shared task, we focus only on Vietnamese news
+summarization and build a human-annotated dataset of 1,839 documents in 600
+clusters, collected from Vietnamese news in 8 categories. Participating models
+are evaluated and ranked in terms of \texttt{ROUGE2-F1} score, the most typical
+evaluation metric for the document summarization problem.
+
+
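+ For reference, the ranking metric above, \texttt{ROUGE2-F1}, is the F1 score
+over bigram overlap between a candidate summary and the reference. A minimal
+sketch is below; official evaluations typically also apply tokenization details
+for Vietnamese that are not reproduced here:
+
+    # Hedged sketch: bare-bones ROUGE-2 F1 via bigram overlap counting.
+    from collections import Counter
+
+    def rouge2_f1(candidate, reference):
+        def bigrams(text):
+            toks = text.split()
+            return Counter(zip(toks, toks[1:]))
+        c, r = bigrams(candidate), bigrams(reference)
+        overlap = sum((c & r).values())
+        if overlap == 0:
+            return 0.0
+        precision = overlap / sum(c.values())
+        recall = overlap / sum(r.values())
+        return 2 * precision * recall / (precision + recall)
+
+    print(rouge2_f1("the cat sat on the mat", "the cat is on the mat"))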
+
+ comment: VLSP 2022 +
+
+
+
+
+ + ☆ A Comparative and Experimental Study on Automatic Question Answering + Systems and its Robustness against Word Jumbling + + +
+ Question answer generation using Natural Language Processing models is
+ubiquitous in the world around us. It is used in many applications, such as
+building chatbots, suggesting prompts in Google search, and navigating
+information in mobile banking applications. It is highly relevant because a
+frequently asked questions (FAQ) list can only contain a finite number of
+questions, whereas a model that performs question answer generation can answer
+completely new questions that are within the scope of the data. This allows
+new questions to be answered accurately as long as they are relevant. In
+commercial applications, it can be used to increase customer satisfaction and
+ease of use. However, a lot of data is generated by humans and is therefore
+susceptible to human error, which can adversely affect a model's performance;
+we investigate this effect in our work.
+
+
+
+
+
+
+ + ☆ A Corpus for Named Entity Recognition in Chinese Novels with + Multi-genres + + +
+ Entities such as persons, locations, and organizations are important for
+literary text analysis. The lack of annotated data hinders the progress of
+named entity recognition (NER) in the literary domain. To promote research on
+literary NER, we build the largest multi-genre literary NER corpus, containing
+263,135 entities in 105,851 sentences from 260 online Chinese novels spanning
+13 different genres. Based on the corpus, we investigate characteristics of
+entities from different genres. We propose several baseline NER models and
+conduct cross-genre and cross-domain experiments. Experimental results show
+that genre differences significantly impact NER performance, though not as much
+as domain differences, such as that between the literary and news domains.
+Compared with NER in the news domain, literary NER still needs much
+improvement, and the Out-of-Vocabulary (OOV) problem is more challenging due to
+the high variety of entities in literary works.
+
+
+
+
+
+
+ + ☆ Improving Word Sense Disambiguation in Neural Machine Translation with + Salient Document Context + + +
+ Lexical ambiguity is a challenging and pervasive problem in machine
+translation (MT). We introduce a simple and scalable approach to resolve
+translation ambiguity by incorporating a small amount of extra-sentential
+context in neural MT. Our approach requires no sense annotation and no change
+to standard model architectures. Since actual document context is not available
+for the vast majority of MT training data, we collect related sentences for
+each input to construct pseudo-documents. Salient words from pseudo-documents
+are then encoded as a prefix to each source sentence to condition the
+generation of the translation. To evaluate, we release DocMuCoW, a challenge
+set for translation disambiguation based on the English-German MuCoW
+(Raganato et al., 2020) augmented with document IDs. Extensive experiments show
+that our method translates ambiguous source words better than strong
+sentence-level baselines and comparable document-level baselines while
+reducing training costs.
+
+
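+ The prefixing idea above can be illustrated with a simple salience scorer.
+TF-IDF is one reasonable choice and the separator token is an assumption, so
+this is a sketch of the general recipe rather than the paper's exact method:
+
+    # Hedged sketch: extract salient words from a pseudo-document and prepend
+    # them to the source sentence before translation.
+    import numpy as np
+    from sklearn.feature_extraction.text import TfidfVectorizer
+
+    def salient_prefix(source_sentence, related_sentences, top_n=5,
+                       sep=" <sep> "):
+        pseudo_doc = " ".join(related_sentences)
+        vec = TfidfVectorizer().fit(related_sentences + [source_sentence])
+        scores = vec.transform([pseudo_doc]).toarray()[0]
+        vocab = np.array(vec.get_feature_names_out())
+        salient = vocab[np.argsort(scores)[-top_n:]][::-1]
+        return " ".join(salient) + sep + source_sentence
+
+    print(salient_prefix(
+        "The bank was closed.",
+        ["He sat on the river bank.", "The bank of the river flooded."]))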
+
+
+
+
+ + ☆ Function-constrained Program Synthesis NeurIPS + + +
+ This work introduces (1) a technique that allows large language models (LLMs) +to leverage user-provided code when solving programming tasks and (2) a method +to iteratively generate modular sub-functions that can aid future code +generation attempts when the initial code generated by the LLM is inadequate. +Generating computer programs in general-purpose programming languages like +Python poses a challenge for LLMs when instructed to use code provided in the +prompt. Code-specific LLMs (e.g., GitHub Copilot, CodeLlama2) can generate code +completions in real-time by drawing on all code available in a development +environment. However, restricting code-specific LLMs to use only in-context +code is not straightforward, as the model is not explicitly instructed to use +the user-provided code and users cannot highlight precisely which snippets of +code the model should incorporate into its context. Moreover, current systems +lack effective recovery methods, forcing users to iteratively re-prompt the +model with modified prompts until a sufficient solution is reached. Our method +differs from traditional LLM-powered code-generation by constraining +code-generation to an explicit function set and enabling recovery from failed +attempts through automatically generated sub-functions. When the LLM cannot +produce working code, we generate modular sub-functions to aid subsequent +attempts at generating functional code. A by-product of our method is a library +of reusable sub-functions that can solve related tasks, imitating a software +team where efficiency scales with experience. We also introduce a new +"half-shot" evaluation paradigm that provides tighter estimates of LLMs' coding +abilities compared to traditional zero-shot evaluation. Our proposed evaluation +method encourages models to output solutions in a structured format, decreasing +syntax errors that can be mistaken for poor coding ability. + +
+
+ comment: 17 pages, 6 figures, 2023 NeurIPS R0-Fomo Workshop +
+
+
+
+
+ + ☆ Optimizing and Fine-tuning Large Language Model for Urban Renewal + + +
+ This study aims to innovatively explore adaptive applications of large +language models (LLM) in urban renewal. It also aims to improve its performance +and text generation quality for knowledge question-answering (QA) tasks. Based +on the ChatGLM, we automatically generate QA datasets using urban renewal +scientific literature corpora in a self-instruct manner and then conduct joint +fine-tuning training on the model using the Prefix and LoRA fine-tuning methods +to create an LLM for urban renewal. By guiding the LLM to automatically +generate QA data based on prompt words and given text, it is possible to +quickly obtain datasets in the urban renewal field and provide data support for +the fine-tuning training of LLMs. The experimental results show that the joint +fine-tuning training method proposed in this study can significantly improve +the performance of LLM on the QA tasks. Compared with LoRA fine-tuning, the +method improves the Bleu and Rouge metrics on the test by about 5%; compared +with the model before fine-tuning, the method improves the Bleu and Rouge +metrics by about 15%-20%. This study demonstrates the effectiveness and +superiority of the joint fine-tuning method using Prefix and LoRA for ChatGLM +in the urban renewal knowledge QA tasks. It provides a new approach for +fine-tuning LLMs on urban renewal-related tasks. + +
+
+ comment: 11 pages, 2 figures, 2 tables, 41 references +
+
+
+
+
+ + ☆ Automatic Time Signature Determination for New Scores Using Lyrics for + Latent Rhythmic Structure + + +
+ There has recently been a sharp increase in interest in Artificial +Intelligence-Generated Content (AIGC). Despite this, musical components such as +time signatures have not been studied sufficiently to form an algorithmic +determination approach for new compositions, especially lyrical songs. This is +likely because of the neglect of musical details, which is critical for +constructing a robust framework. Specifically, time signatures establish the +fundamental rhythmic structure for almost all aspects of a song, including the +phrases and notes. In this paper, we propose a novel approach that only uses +lyrics as input to automatically generate a fitting time signature for lyrical +songs and uncover the latent rhythmic structure utilizing explainable machine +learning models. In particular, we devise multiple methods that are associated +with discovering lyrical patterns and creating new features that simultaneously +contain lyrical, rhythmic, and statistical information. In this approach, the +best of our experimental results reveal a 97.6% F1 score and a 0.996 Area Under +the Curve (AUC) of the Receiver Operating Characteristic (ROC) score. In +conclusion, our research directly generates time signatures from lyrics +automatically for new scores utilizing machine learning, which is an innovative +idea that approaches an understudied component of musicology and therefore +contributes significantly to the future of Artificial Intelligence (AI) music +generation. + +
+
+ comment: Submitted to IEEE Big Data 2023 Conference +
+
+
+
+
+ + ☆ Reducing Gender Bias in Machine Translation through Counterfactual Data + Generation + + +
+ Recent advances in neural methods have led to substantial improvement in the
+quality of Neural Machine Translation (NMT) systems. However, these systems
+frequently produce translations with inaccurate gender (Stanovsky et al.,
+2019), which can be traced to bias in the training data. Saunders and Byrne
+(2020) tackle this problem with a handcrafted dataset containing balanced
+gendered profession words. By using this data to fine-tune an existing NMT
+model, they show that gender bias can be significantly mitigated, albeit at the
+expense of translation quality due to catastrophic forgetting. They recover
+some of the lost quality with modified training objectives or additional models
+at inference. We find, however, that simply supplementing the handcrafted
+dataset with a random sample from the base model's training corpus is enough to
+significantly reduce the catastrophic forgetting. We also propose a novel
+domain-adaptation technique that leverages in-domain data created with the
+counterfactual data generation techniques proposed by Zmigrod et al. (2019) to
+further improve accuracy on the WinoMT challenge test set without significant
+loss in translation quality. We show its effectiveness in NMT systems from
+English into three morphologically rich languages: French, Spanish, and
+Italian. The relevant dataset and code will be available on GitHub.
+
+
+
+
+
+ + ☆ Releasing the CRaQAn (Coreference Resolution in Question-Answering): An + open-source dataset and dataset creation methodology using + instruction-following models NeurIPS 2023 + + +
+ Instruction-following language models demand robust methodologies for +information retrieval to augment instructions for question-answering +applications. A primary challenge is the resolution of coreferences in the +context of chunking strategies for long documents. The critical barrier to +experimentation of handling coreferences is a lack of open source datasets, +specifically in question-answering tasks that require coreference resolution. +In this work we present our Coreference Resolution in Question-Answering +(CRaQAn) dataset, an open-source dataset that caters to the nuanced information +retrieval requirements of coreference resolution in question-answering tasks by +providing over 250 question-answer pairs containing coreferences. To develop +this dataset, we developed a novel approach for creating high-quality datasets +using an instruction-following model (GPT-4) and a Recursive Criticism and +Improvement Loop. + +
+
+ comment: NeurIPS 2023 Workshop on Instruction Tuning and Instruction Following +
+
+
+
+
+ + ☆ Comprehensive Benchmarking of Entropy and Margin Based Scoring Metrics + for Data Selection SP + + +
+ While data selection methods have been studied extensively in active +learning, data pruning, and data augmentation settings, there is little +evidence for the efficacy of these methods in industry scale settings, +particularly in low-resource languages. Our work presents ways of assessing +prospective training examples in those settings for their "usefulness" or +"difficulty". We also demonstrate how these measures can be used in selecting +important examples for training supervised machine learning models. We +primarily experiment with entropy and Error L2-Norm (EL2N) scores. We use these +metrics to curate high quality datasets from a large pool of \textit{Weak +Signal Labeled} data, which assigns no-defect high confidence hypotheses during +inference as ground truth labels. We then conduct training data augmentation +experiments using these de-identified datasets and demonstrate that score-based +selection can result in a 2% decrease in semantic error rate and 4%-7% decrease +in domain classification error rate when compared to the baseline technique of +random selection. + +
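+ For concreteness, here is a minimal sketch of the two example-scoring metrics
+named above, assuming per-example softmax probabilities from the model are
+available; the toy arrays are illustrative only.
+
+import numpy as np
+
+def entropy_score(probs):
+    # Predictive entropy per example; higher values mark more uncertain / "difficult" examples.
+    return -(probs * np.log(probs + 1e-12)).sum(axis=1)
+
+def el2n_score(probs, labels):
+    # Error L2-Norm: ||softmax(x) - one_hot(y)||_2 per example.
+    one_hot = np.eye(probs.shape[1])[labels]
+    return np.linalg.norm(probs - one_hot, axis=1)
+
+# Toy usage: keep the highest-scoring examples for training.
+probs = np.array([[0.7, 0.2, 0.1], [0.34, 0.33, 0.33]])
+labels = np.array([0, 2])
+keep = np.argsort(-el2n_score(probs, labels))[:1]
+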
+
+ comment: Accepted to Efficient Natural Language and Speech Processing + (ENLSP-III) workshop at NeurIPS '23 +
+
+
+
+
+ + ☆ Influence Scores at Scale for Efficient Language Data Sampling EMNLP '23 + + +
+ Modern ML systems ingest data aggregated from diverse sources, such as +synthetic, human-annotated, and live customer traffic. Understanding +\textit{which} examples are important to the performance of a learning +algorithm is crucial for efficient model training. Recently, a growing body of +literature has given rise to various "influence scores," which use training +artifacts such as model confidence or checkpointed gradients to identify +important subsets of data. However, these methods have primarily been developed +in computer vision settings, and it remains unclear how well they generalize to +language-based tasks using pretrained models. + In this paper, we explore the applicability of influence scores in language +classification tasks. We evaluate a diverse subset of these scores on the SNLI +dataset by quantifying accuracy changes in response to pruning training data +through random and influence-score-based sampling. We then stress-test one of +the scores -- "variance of gradients" (VoG) from Agarwal et al. (2022) -- in an +NLU model stack that was exposed to dynamic user speech patterns in a voice +assistant type of setting. Our experiments demonstrate that in many cases, +encoder-based language models can be finetuned on roughly 50% of the original +data without degradation in performance metrics. Along the way, we summarize +lessons learned from applying out-of-the-box implementations of influence +scores, quantify the effects of noisy and class-imbalanced data, and offer +recommendations on score-based sampling for better accuracy and training +efficiency. + +
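+ One common reading of the "variance of gradients" (VoG) score mentioned above
+is sketched below: for each example, a gradient-based quantity is computed at
+several saved training checkpoints and scored by its variance across
+checkpoints. The choice of gradient (loss w.r.t. input embeddings) and the
+assumption that the model accepts embeddings directly are illustrative
+simplifications, not the exact recipe of Agarwal et al. (2022).
+
+import torch
+
+def vog_scores(checkpoint_models, embed_fn, inputs, labels, loss_fn):
+    per_ckpt = []
+    for model in checkpoint_models:                  # one model per saved checkpoint
+        model.eval()
+        emb = embed_fn(inputs).detach().requires_grad_(True)
+        loss = loss_fn(model(emb), labels)
+        (grad,) = torch.autograd.grad(loss, emb)
+        per_ckpt.append(grad.flatten(start_dim=1))   # [batch, dim]
+    stacked = torch.stack(per_ckpt)                  # [num_ckpts, batch, dim]
+    # Variance across checkpoints, averaged over dimensions -> one score per example.
+    return stacked.var(dim=0).mean(dim=1)
+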
+
+ comment: Accepted at EMNLP '23 +
+
+
+
+
+ + ☆ Student Mastery or AI Deception? Analyzing ChatGPT's Assessment + Proficiency and Evaluating Detection Strategies + + +
+ Generative AI systems such as ChatGPT have a disruptive effect on learning +and assessment. Computer science requires practice to develop skills in problem +solving and programming that are traditionally developed using assignments. +Generative AI has the capability of completing these assignments for students +with high accuracy, which dramatically increases the potential for academic +integrity issues and students not achieving desired learning outcomes. This +work investigates the performance of ChatGPT by evaluating it across three +courses (CS1,CS2,databases). ChatGPT completes almost all introductory +assessments perfectly. Existing detection methods, such as MOSS and JPlag +(based on similarity metrics) and GPTzero (AI detection), have mixed success in +identifying AI solutions. Evaluating instructors and teaching assistants using +heuristics to distinguish between student and AI code shows that their +detection is not sufficiently accurate. These observations emphasize the need +for adapting assessments and improved detection methods. + +
+
+ comment: 7 pages, Published in 2023 International Conference on Computational + Science and Computational Intelligence Research Track on Education, IEEE CPS +
+
+
+
+
+ + ☆ Applications of Large Language Models in Data Processing: Innovative + Approaches to Segmenting and Renewing Information + + +
+ Our paper investigates effective methods for code generation in
+domain-specific applications, including the use of Large Language Models (LLMs)
+for data segmentation and renewal, as well as eliciting deeper reasoning from
+LLMs through prompt adjustments. Using a real company product as an example, we
+provide user manuals, API documentation, and other data. We segment this data
+and convert it into semantic vectors so that each chunk better reflects its
+true position in the documentation. User requirements are then transformed into
+vectors to retrieve the most relevant content, achieving about 70% accuracy on
+simple to medium-complexity tasks through various prompting techniques. This
+paper is the first to enhance domain-specific code generation effectiveness
+from this perspective. Additionally, we experiment with generating more scripts
+from a limited number of examples using Llama 2-based fine-tuning to test its
+effectiveness in professional-domain code generation. This is a challenging and
+promising field; progress here will not only lead to breakthroughs in LLM
+development across multiple industries but also enable LLMs to understand and
+learn new knowledge effectively.
+
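+ A minimal sketch of the retrieval step described above: document chunks and
+the user requirement are mapped to semantic vectors and matched by cosine
+similarity. The `embed` stub stands in for whatever embedding model is actually
+used, which is not specified here.
+
+import numpy as np
+
+def embed(texts):
+    # Placeholder: replace with a real embedding model returning an array of shape [n, dim].
+    raise NotImplementedError("plug in an embedding model here")
+
+def retrieve(chunks, requirement, top_k=3):
+    chunk_vecs = embed(chunks)                         # [n_chunks, dim]
+    query_vec = embed([requirement])[0]                # [dim]
+    sims = chunk_vecs @ query_vec / (
+        np.linalg.norm(chunk_vecs, axis=1) * np.linalg.norm(query_vec) + 1e-12)
+    return [chunks[i] for i in np.argsort(-sims)[:top_k]]
+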
+
+
+
+
+ + ☆ An Exploration of Left-Corner Transformations EMNLP 2023 + + +
+ The left-corner transformation (Rosenkrantz and Lewis, 1970) is used to +remove left recursion from context-free grammars, which is an important step +towards making the grammar parsable top-down with simple techniques. This paper +generalizes prior left-corner transformations to support semiring-weighted +production rules and to provide finer-grained control over which left corners +may be moved. Our generalized left-corner transformation (GLCT) arose from +unifying the left-corner transformation and speculation transformation (Eisner +and Blatz, 2007), originally for logic programming. Our new transformation and +speculation define equivalent weighted languages. Yet, their derivation trees +are structurally different in an important way: GLCT replaces left recursion +with right recursion, and speculation does not. We also provide several +technical results regarding the formal relationships between the outputs of +GLCT, speculation, and the original grammar. Lastly, we empirically investigate +the efficiency of GLCT for left-recursion elimination from grammars of nine +languages. + +
+
+ comment: Main conference long paper at EMNLP 2023 +
+
+
+
+
+ + ☆ Removing NSFW Concepts from Vision-and-Language Models for Text-to-Image + Retrieval and Generation + + +
+ Vision-and-Language models such as CLIP have demonstrated remarkable +effectiveness across a wide range of tasks. However, these models are typically +trained on web-scale data, which can introduce inappropriate content and lead +to the development of unsafe and biased behavior. This, in turn, hampers their +applicability in sensitive and trustworthy contexts and could raise significant +concern in their adoption. To overcome these limitations, we introduce a +methodology to make Vision-and-Language models safer by removing their +sensitivity to not-safe-for-work concepts. We show how this can be done by +distilling from a large language model which converts between safe and unsafe +sentences and which is fine-tuned starting from just 100 manually-curated +pairs. We conduct extensive experiments on the resulting embedding space for +both retrieval and text-to-image generation, where we show that our model can +also be properly employed with pre-trained image generators. Our source code +and trained models are available at: https://github.com/aimagelab/safe-clip. + +
+
+
+
+
+ + ☆ MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning + Benchmark for Expert AGI + + +
+ We introduce MMMU: a new benchmark designed to evaluate multimodal models on +massive multi-discipline tasks demanding college-level subject knowledge and +deliberate reasoning. MMMU includes 11.5K meticulously collected multimodal +questions from college exams, quizzes, and textbooks, covering six core +disciplines: Art & Design, Business, Science, Health & Medicine, Humanities & +Social Science, and Tech & Engineering. These questions span 30 subjects and +183 subfields, comprising 30 highly heterogeneous image types, such as charts, +diagrams, maps, tables, music sheets, and chemical structures. Unlike existing +benchmarks, MMMU focuses on advanced perception and reasoning with +domain-specific knowledge, challenging models to perform tasks akin to those +faced by experts. Our evaluation of 14 open-source LMMs and the proprietary +GPT-4V(ision) highlights the substantial challenges posed by MMMU. Even the +advanced GPT-4V only achieves a 56% accuracy, indicating significant room for +improvement. We believe MMMU will stimulate the community to build +next-generation multimodal foundation models towards expert artificial general +intelligence. + +
+
+ comment: 115 pages, 99 figures +
+
+
+
+
+ + ☆ ChartLlama: A Multimodal LLM for Chart Understanding and Generation + + +
+ Multi-modal large language models have demonstrated impressive performances +on most vision-language tasks. However, the model generally lacks the +understanding capabilities for specific domain data, particularly when it comes +to interpreting chart figures. This is mainly due to the lack of relevant +multi-modal instruction tuning datasets. In this article, we create a +high-quality instruction-tuning dataset leveraging GPT-4. We develop a +multi-step data generation process in which different steps are responsible for +generating tabular data, creating chart figures, and designing instruction +tuning data separately. Our method's flexibility enables us to generate +diverse, high-quality instruction-tuning data consistently and efficiently +while maintaining a low resource expenditure. Additionally, it allows us to +incorporate a wider variety of chart and task types not yet featured in +existing datasets. Next, we introduce ChartLlama, a multi-modal large language +model that we've trained using our created dataset. ChartLlama outperforms all +prior methods in ChartQA, Chart-to-text, and Chart-extraction evaluation +benchmarks. Additionally, ChartLlama significantly improves upon the baseline +in our specially compiled chart dataset, which includes new chart and task +types. The results of ChartLlama confirm the value and huge potential of our +proposed data generation method in enhancing chart comprehension. + +
+
+ comment: Code and model on https://tingxueronghua.github.io/ChartLlama/ +
+
+
+
+
+ + ☆ ChatTraffic: Text-to-Traffic Generation via Diffusion Model + + +
+ Traffic prediction is one of the most significant foundations in Intelligent
+Transportation Systems (ITS). Traditional traffic prediction methods rely only
+on historical traffic data to predict traffic trends and face two main
+challenges: 1) insensitivity to unusual events, and 2) poor performance in
+long-term prediction. In this work, we explore how generative models combined
+with text describing the traffic system can be applied for traffic generation,
+and name the task Text-to-Traffic Generation (TTG). The key challenge of the
+TTG task is how to associate text with the spatial structure of the road
+network and traffic data for generating traffic situations. To this end, we
+propose ChatTraffic, the first diffusion model for text-to-traffic generation.
+To guarantee the consistency between synthetic and real data, we augment a
+diffusion model with the Graph Convolutional Network (GCN) to extract spatial
+correlations of traffic data. In addition, we construct a large dataset
+containing text-traffic pairs for the TTG task. We benchmark our model
+qualitatively and quantitatively on the released dataset. The experimental
+results indicate that ChatTraffic can generate realistic traffic situations
+from text. Our code and dataset are available at
+https://github.com/ChyaZhang/ChatTraffic.
+
+
+
+
+
+ + ☆ Pre-trained Language Models Do Not Help Auto-regressive Text-to-Image + Generation + + +
+ Recent advances in image tokenizers, such as VQ-VAE, have enabled +text-to-image generation using auto-regressive methods, similar to language +modeling. However, these methods have yet to leverage pre-trained language +models, despite their adaptability to various downstream tasks. In this work, +we explore this gap by adapting a pre-trained language model for +auto-regressive text-to-image generation, and find that pre-trained language +models offer limited help. We provide a two-fold explanation by analyzing +tokens from each modality. First, we demonstrate that image tokens possess +significantly different semantics compared to text tokens, rendering +pre-trained language models no more effective in modeling them than randomly +initialized ones. Second, the text tokens in the image-text datasets are too +simple compared to normal language model pre-training data, which causes the +catastrophic degradation of language models' capability. + +
+
+
+
+
+ + ☆ MI-Gen: Multiple Instance Generation of Pathology Reports for Gigapixel + Whole-Slide Images + + +
+ Whole slide images are the foundation of digital pathology for the diagnosis
+and treatment of carcinomas. Writing pathology reports is laborious and
+error-prone for inexperienced pathologists. To reduce the workload and improve
+clinical automation, we investigate how to generate pathology reports given
+whole slide images. On the data end, we curated the largest WSI-text dataset
+(TCGA-PathoText). Specifically, we collected nearly 10,000 high-quality
+WSI-text pairs for visual-language models by recognizing and cleaning pathology
+reports that narrate diagnostic slides in TCGA. On the model end, we propose
+the multiple instance generative model (MI-Gen), which can produce pathology
+reports for gigapixel WSIs. We benchmark our model on the largest subset of
+TCGA-PathoText. Experimental results show our model can generate pathology
+reports that contain multiple clinical clues. Furthermore, WSI-text prediction
+can be seen as a form of visual-language pre-training, which enables our model
+to be transferred to downstream diagnostic tasks like carcinoma grading and
+phenotyping. We observe that simple semantic extraction from the pathology
+reports can achieve the best performance (an F1 score of 0.838) on BRCA
+subtyping without adding extra parameters or tricky fine-tuning. Our collected
+dataset and related code will all be publicly available.
+
+
+
+
+
+ + ☆ Compositional Chain-of-Thought Prompting for Large Multimodal Models + + +
+ The combination of strong visual backbones and Large Language Model (LLM)
+reasoning has led to Large Multimodal Models (LMMs) becoming the current
+standard for a wide range of vision and language (VL) tasks. However, recent
+research has shown that even the most advanced LMMs still struggle to capture
+aspects of compositional visual reasoning, such as attributes and relationships
+between objects. One solution is to utilize scene graphs (SGs), a formalization
+of objects and their relations and attributes that has been extensively used as
+a bridge between the visual and textual domains. Yet, scene graph data requires
+scene graph annotations, which are expensive to collect and thus not easily
+scalable. Moreover, finetuning an LMM based on SG data can lead to catastrophic
+forgetting of the pretraining objective. To overcome this, inspired by
+chain-of-thought methods, we propose Compositional Chain-of-Thought (CCoT), a
+novel zero-shot Chain-of-Thought prompting method that utilizes SG
+representations in order to extract compositional knowledge from an LMM.
+Specifically, we first generate an SG using the LMM, and then use that SG in
+the prompt to produce a response. Through extensive experiments, we find that
+the proposed CCoT approach not only improves LMM performance on several
+vision-and-language (VL) compositional benchmarks but also improves the
+performance of several popular LMMs on general multimodal benchmarks, without
+the need for fine-tuning or annotated ground-truth SGs.
+
+
+
+
+
+ + ♻ ☆ Neuradicon: operational representation learning of neuroimaging reports + + +
+ Radiological reports typically summarize the content and interpretation of +imaging studies in unstructured form that precludes quantitative analysis. This +limits the monitoring of radiological services to throughput undifferentiated +by content, impeding specific, targeted operational optimization. Here we +present Neuradicon, a natural language processing (NLP) framework for +quantitative analysis of neuroradiological reports. Our framework is a hybrid +of rule-based and artificial intelligence models to represent neurological +reports in succinct, quantitative form optimally suited to operational +guidance. We demonstrate the application of Neuradicon to operational +phenotyping of a corpus of 336,569 reports, and report excellent +generalizability across time and two independent healthcare institutions. + +
+
+ comment: 26 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Evaluating the Robustness to Instructions of Large Language Models + + +
+ Recently, instruction fine-tuning has risen to prominence as a potential
+method for enhancing the zero-shot capabilities of Large Language Models (LLMs)
+on novel tasks. This technique has shown an exceptional ability to boost the
+performance of moderately sized LLMs, sometimes even reaching performance
+levels comparable to those of much larger model variants. Our focus is on the
+robustness of instruction-tuned LLMs to seen and unseen tasks. We conducted an
+exploration of six models, including Alpaca, Vicuna, WizardLM, and traditional
+task-oriented models (Flan-T5-XL/XXL, T0++), using real-world relation
+extraction datasets as case studies. We carried out a comprehensive evaluation
+of these instruction-following LLMs, which have been tuned based on open-domain
+instructions and task-oriented instructions. The main discussion concerns their
+performance on and robustness to instructions. We observed that in most cases,
+the model's performance in dealing with unfamiliar instructions tends to worsen
+significantly, and the model's robustness to relation extraction (RE)
+instructions deteriorates compared to QA instructions. Further, we discovered
+that up until a certain parameter size threshold (3B), the performance of the
+FLAN-T5 model improves as the parameter count increases. The robustness of
+different scales of FLAN-T5 models to RE instructions is worse than their
+robustness to QA instructions.
+
+
+ comment: There were major problems with the experimental data +
+
+
+
+
+ + ♻ ☆ What If the TV Was Off? Examining Counterfactual Reasoning Abilities of + Multi-modal Language Models + + +
+ Counterfactual reasoning, a fundamental aspect of human cognition, involves +contemplating alternatives to established facts or past events, significantly +enhancing our abilities in planning and decision-making. In light of the +advancements in current multi-modal large language models, we explore their +effectiveness in counterfactual reasoning. To facilitate this investigation, we +introduce a novel dataset, C-VQA, specifically designed to test the +counterfactual reasoning capabilities of modern multi-modal large language +models. This dataset is constructed by infusing original questions with +counterfactual presuppositions, spanning various types such as numerical and +boolean queries. It encompasses a mix of real and synthetic data, representing +a wide range of difficulty levels. Our thorough evaluations of contemporary +vision-language models using this dataset have revealed substantial performance +drops, with some models showing up to a 40\% decrease, highlighting a +significant gap between current models and human-like vision reasoning +capabilities. We hope our dataset will serve as a vital benchmark for +evaluating the counterfactual reasoning capabilities of models. Code and +dataset are publicly available at https://bzhao.me/C-VQA/. + +
+
+
+
+
+ + ♻ ☆ Average Token Delay: A Duration-aware Latency Metric for Simultaneous + Translation INTERSPEECH 2023 + + +
+ Simultaneous translation is a task in which the translation begins before the +end of an input speech segment. Its evaluation should be conducted based on +latency in addition to quality, and for users, the smallest possible amount of +latency is preferable. Most existing metrics measure latency based on the start +timings of partial translations and ignore their duration. This means such +metrics do not penalize the latency caused by long translation output, which +delays the comprehension of users and subsequent translations. In this work, we +propose a novel latency evaluation metric for simultaneous translation called +\emph{Average Token Delay} (ATD) that focuses on the duration of partial +translations. We demonstrate its effectiveness through analyses simulating +user-side latency based on Ear-Voice Span (EVS). In our experiment, ATD had the +highest correlation with EVS among baseline latency metrics under most +conditions. + +
+
+ comment: Extended version of the paper (doi: 10.21437/Interspeech.2023-933) + which appeared in INTERSPEECH 2023 +
+
+
+
+
+ + ♻ ☆ Technical Report: Large Language Models can Strategically Deceive their + Users when Put Under Pressure + + +
+ We demonstrate a situation in which Large Language Models, trained to be +helpful, harmless, and honest, can display misaligned behavior and +strategically deceive their users about this behavior without being instructed +to do so. Concretely, we deploy GPT-4 as an agent in a realistic, simulated +environment, where it assumes the role of an autonomous stock trading agent. +Within this environment, the model obtains an insider tip about a lucrative +stock trade and acts upon it despite knowing that insider trading is +disapproved of by company management. When reporting to its manager, the model +consistently hides the genuine reasons behind its trading decision. We perform +a brief investigation of how this behavior varies under changes to the setting, +such as removing model access to a reasoning scratchpad, attempting to prevent +the misaligned behavior by changing system instructions, changing the amount of +pressure the model is under, varying the perceived risk of getting caught, and +making other simple changes to the environment. To our knowledge, this is the +first demonstration of Large Language Models trained to be helpful, harmless, +and honest, strategically deceiving their users in a realistic situation +without direct instructions or training for deception. + +
+
+
+
+
+ + ♻ ☆ Self-Evolution Learning for Mixup: Enhance Data Augmentation on Few-Shot + Text Classification Tasks + + +
+ Text classification tasks often encounter few-shot scenarios with limited
+labeled data, and addressing data scarcity is crucial. Data augmentation with
+mixup has been shown to be effective on various text classification tasks.
+However, most mixup methods do not consider the varying degree of learning
+difficulty at different stages of training, and they generate new samples with
+one-hot labels, resulting in model overconfidence. In this paper, we propose a
+self-evolution learning (SE) based mixup approach for data augmentation in text
+classification, which can generate more adaptive and model-friendly pseudo
+samples for model training. SE focuses on the variation of the model's learning
+ability. To alleviate model overconfidence, we introduce a novel
+instance-specific label smoothing approach, which linearly interpolates the
+model's output and the one-hot labels of the original samples to generate new
+soft labels for mixing up. Through experimental analysis, we demonstrate that,
+in addition to improving classification accuracy, SE also enhances the model's
+generalization ability.
+
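+ A hedged PyTorch sketch of the instance-specific label smoothing described
+above: the soft target linearly interpolates the one-hot label with the model's
+own prediction, and the smoothed pairs are then mixed up. The interpolation
+weight `alpha` and the Beta parameter are illustrative choices, not the paper's
+tuned values.
+
+import torch
+import torch.nn.functional as F
+
+def instance_specific_soft_labels(logits, labels, num_classes, alpha=0.1):
+    one_hot = F.one_hot(labels, num_classes).float()
+    probs = logits.softmax(dim=-1).detach()
+    return (1 - alpha) * one_hot + alpha * probs        # per-example soft target
+
+def mixup(inputs, soft_labels, beta=0.5):
+    lam = torch.distributions.Beta(beta, beta).sample()
+    idx = torch.randperm(inputs.size(0))
+    mixed_x = lam * inputs + (1 - lam) * inputs[idx]
+    mixed_y = lam * soft_labels + (1 - lam) * soft_labels[idx]
+    return mixed_x, mixed_y
+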
+
+
+
+
+ + ♻ ☆ RCT Rejection Sampling for Causal Estimation Evaluation + + +
+ Confounding is a significant obstacle to unbiased estimation of causal +effects from observational data. For settings with high-dimensional covariates +-- such as text data, genomics, or the behavioral social sciences -- +researchers have proposed methods to adjust for confounding by adapting machine +learning methods to the goal of causal estimation. However, empirical +evaluation of these adjustment methods has been challenging and limited. In +this work, we build on a promising empirical evaluation strategy that +simplifies evaluation design and uses real data: subsampling randomized +controlled trials (RCTs) to create confounded observational datasets while +using the average causal effects from the RCTs as ground-truth. We contribute a +new sampling algorithm, which we call RCT rejection sampling, and provide +theoretical guarantees that causal identification holds in the observational +data to allow for valid comparisons to the ground-truth RCT. Using synthetic +data, we show our algorithm indeed results in low bias when oracle estimators +are evaluated on the confounded samples, which is not always the case for a +previously proposed algorithm. In addition to this identification result, we +highlight several finite data considerations for evaluation designers who plan +to use RCT rejection sampling on their own datasets. As a proof of concept, we +implement an example evaluation pipeline and walk through these finite data +considerations with a novel, real-world RCT -- which we release publicly -- +consisting of approximately 70k observations and text data as high-dimensional +covariates. Together, these contributions build towards a broader agenda of +improved empirical evaluation for causal estimation. + +
+
+ comment: Code and data at https://github.com/kakeith/rct_rejection_sampling +
+
+
+
+
+ + ♻ ☆ Sentiment analysis with adaptive multi-head attention in Transformer + + +
+ We propose a novel framework based on the attention mechanism to identify the
+sentiment of a movie review document. Previous efforts on deep neural networks
+with attention mechanisms focus on encoders and decoders with a fixed number of
+multi-head attention heads. Therefore, we need a mechanism to stop the
+attention process automatically if no more useful information can be read from
+the memory. In this paper, we propose an adaptive multi-head attention
+architecture (AdaptAttn) which varies the number of attention heads based on
+sentence length. AdaptAttn has a data preprocessing step where each document is
+classified into one of three bins (small, medium, or large) based on sentence
+length. Documents classified as small go through two heads in each layer, the
+medium group passes through four heads, and the large group is processed by
+eight heads. We examine the merit of our model on the Stanford Large Movie
+Review dataset. The experimental results show that the F1 score of our model is
+on par with the baseline model.
+
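+ A minimal sketch of the length-based head selection described above: documents
+are binned as small, medium, or large and routed to attention layers with two,
+four, or eight heads respectively. The bin thresholds are illustrative
+assumptions; the embedding dimension must be divisible by the chosen head
+count.
+
+import torch.nn as nn
+
+def num_heads_for(num_tokens, small_max=64, medium_max=256):
+    if num_tokens <= small_max:
+        return 2    # "small" bin
+    if num_tokens <= medium_max:
+        return 4    # "medium" bin
+    return 8        # "large" bin
+
+def build_attention(embed_dim, num_tokens):
+    # embed_dim must be divisible by the selected number of heads (e.g., 512).
+    return nn.MultiheadAttention(embed_dim, num_heads=num_heads_for(num_tokens), batch_first=True)
+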
+
+ comment: Accepted by the 4th International Conference on Signal Processing and + Machine Learning +
+
+
+
+
+ + ♻ ☆ PACuna: Automated Fine-Tuning of Language Models for Particle + Accelerators + + +
+ Navigating the landscape of particle accelerators has become increasingly
+challenging with recent surges in contributions. These intricate devices
+challenge comprehension, even within individual facilities. To address this, we
+introduce PACuna, a fine-tuned language model refined through publicly
+available accelerator resources like conferences, pre-prints, and books. We
+automated data collection and question generation to minimize expert
+involvement and make the data publicly available. PACuna demonstrates
+proficiency in addressing intricate accelerator questions, validated by
+experts. Our approach shows that adapting language models to scientific domains
+by fine-tuning on technical texts and auto-generated corpora capturing the
+latest developments can produce pre-trained models that answer intricate
+questions which commercially available assistants cannot, and that can serve as
+intelligent assistants for individual facilities.
+
+
+
+
+
+ + ♻ ☆ Empirical Study of PEFT techniques for Winter Wheat Segmentation + + +
+ Parameter Efficient Fine Tuning (PEFT) techniques have recently experienced
+significant growth and have been extensively employed to adapt large vision and
+language models to various domains, enabling satisfactory model performance
+with minimal computational needs. Despite these advances, research has yet to
+fully explore potential PEFT applications in real-life scenarios, particularly
+in the critical domains of remote sensing and crop monitoring. The diversity of
+climates across different regions and the need for comprehensive large-scale
+datasets have posed significant obstacles to accurately identifying crop types
+across varying geographic locations and changing growing seasons. This study
+seeks to bridge this gap by comprehensively exploring the feasibility of
+cross-area and cross-year out-of-distribution generalization using the
+State-of-the-Art (SOTA) wheat crop monitoring model. The aim of this work is to
+explore PEFT approaches for crop monitoring. Specifically, we focus on adapting
+the SOTA TSViT model to address winter wheat field segmentation, a critical
+task for crop monitoring and food security. This adaptation process involves
+integrating different PEFT techniques, including BitFit, LoRA, AdaptFormer, and
+prompt tuning. Using PEFT techniques, we achieved notable results comparable to
+those achieved using full fine-tuning methods while training only 0.7% of the
+parameters of the whole TSViT architecture. The in-house labeled dataset,
+referred to as the Beqaa-Lebanon dataset, comprises high-quality annotated
+polygons for wheat and non-wheat classes with a total surface of 170 km², over
+five consecutive years. Using Sentinel-2 images, our model achieved an 84%
+F1-score. We intend to publicly release the Lebanese winter wheat dataset, code
+repository, and model weights.
+
+
+
+
+
+ + ♻ ☆ Large Language Models for Propaganda Detection + + +
+ The prevalence of propaganda in our digital society poses a challenge to +societal harmony and the dissemination of truth. Detecting propaganda through +NLP in text is challenging due to subtle manipulation techniques and contextual +dependencies. To address this issue, we investigate the effectiveness of modern +Large Language Models (LLMs) such as GPT-3 and GPT-4 for propaganda detection. +We conduct experiments using the SemEval-2020 task 11 dataset, which features +news articles labeled with 14 propaganda techniques as a multi-label +classification problem. Five variations of GPT-3 and GPT-4 are employed, +incorporating various prompt engineering and fine-tuning strategies across the +different models. We evaluate the models' performance by assessing metrics such +as $F1$ score, $Precision$, and $Recall$, comparing the results with the +current state-of-the-art approach using RoBERTa. Our findings demonstrate that +GPT-4 achieves comparable results to the current state-of-the-art. Further, +this study analyzes the potential and challenges of LLMs in complex tasks like +propaganda detection. + +
+
+
+
+
+ + ♻ ☆ Knowledge Graphs for the Life Sciences: Recent Developments, Challenges + and Opportunities + + +
+ The term life sciences refers to the disciplines that study living organisms +and life processes, and include chemistry, biology, medicine, and a range of +other related disciplines. Research efforts in life sciences are heavily +data-driven, as they produce and consume vast amounts of scientific data, much +of which is intrinsically relational and graph-structured. + The volume of data and the complexity of scientific concepts and relations +referred to therein promote the application of advanced knowledge-driven +technologies for managing and interpreting data, with the ultimate aim to +advance scientific discovery. + In this survey and position paper, we discuss recent developments and +advances in the use of graph-based technologies in life sciences and set out a +vision for how these technologies will impact these fields into the future. We +focus on three broad topics: the construction and management of Knowledge +Graphs (KGs), the use of KGs and associated technologies in the discovery of +new knowledge, and the use of KGs in artificial intelligence applications to +support explanations (explainable AI). We select a few exemplary use cases for +each topic, discuss the challenges and open research questions within these +topics, and conclude with a perspective and outlook that summarizes the +overarching challenges and their potential solutions as a guide for future +research. + +
+
+ comment: 33 pages, 1 figure, accepted for Transactions on Graph Data and + Knowledge (TGDK) +
+
+
+
+
+ + ♻ ☆ Towards Codable Watermarking for Injecting Multi-bit Information to LLM + + +
+ As large language models (LLMs) generate texts with increasing fluency and +realism, there is a growing need to identify the source of texts to prevent the +abuse of LLMs. Text watermarking techniques have proven reliable in +distinguishing whether a text is generated by LLMs by injecting hidden patterns +into the generated texts. However, we argue that existing watermarking methods +for LLMs are encoding-inefficient (only contain one bit of information - +whether it is generated from an LLM or not) and cannot flexibly meet the +diverse information encoding needs (such as encoding model version, generation +time, user id, etc.) in different LLMs application scenarios. In this work, we +conduct the first systematic study on the topic of Codable Text Watermarking +for LLMs (CTWL) that allows text watermarks to carry more customizable +information. First of all, we study the taxonomy of LLM watermarking technology +and give a mathematical formulation for CTWL. Additionally, we provide a +comprehensive evaluation system for CTWL: (1) watermarking success rate, (2) +robustness against various corruptions, (3) coding rate of payload information, +(4) encoding and decoding efficiency, (5) impacts on the quality of the +generated text. To meet the requirements of these non-Pareto-improving metrics, +we devise a CTWL method named Balance-Marking, based on the motivation of +ensuring that available and unavailable vocabularies for encoding information +have approximately equivalent probabilities. Compared to the random vocabulary +partitioning extended from the existing work, a probability-balanced vocabulary +partition can significantly improve the quality of the generated text. +Extensive experimental results have shown that our method outperforms a direct +baseline under comprehensive evaluation. + +
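+ In the spirit of the probability-balanced vocabulary partition described
+above, the sketch below greedily splits the next-token distribution into two
+lists with roughly equal probability mass and lets the current payload bit
+choose which list the next token is sampled from. The greedy split and the
+one-bit-per-token coding are illustrative simplifications, not the paper's
+exact Balance-Marking procedure.
+
+import numpy as np
+
+def balanced_partition(next_token_probs):
+    order = np.argsort(-next_token_probs)
+    sets, mass = ([], []), [0.0, 0.0]
+    for tok in order:                        # greedily keep the two halves' probability mass balanced
+        side = 0 if mass[0] <= mass[1] else 1
+        sets[side].append(int(tok))
+        mass[side] += next_token_probs[tok]
+    return sets
+
+def sample_with_bit(next_token_probs, bit, rng=None):
+    rng = rng or np.random.default_rng()
+    allowed = balanced_partition(next_token_probs)[bit]
+    p = next_token_probs[allowed]
+    return int(rng.choice(allowed, p=p / p.sum()))
+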
+
+
+
+
+ + ♻ ☆ Token-Level Adversarial Prompt Detection Based on Perplexity Measures + and Contextual Information + + +
+ In recent years, Large Language Models (LLM) have emerged as pivotal tools in +various applications. However, these models are susceptible to adversarial +prompt attacks, where attackers can carefully curate input strings that lead to +undesirable outputs. The inherent vulnerability of LLMs stems from their +input-output mechanisms, especially when presented with intensely +out-of-distribution (OOD) inputs. This paper proposes a token-level detection +method to identify adversarial prompts, leveraging the LLM's capability to +predict the next token's probability. We measure the degree of the model's +perplexity and incorporate neighboring token information to encourage the +detection of contiguous adversarial prompt sequences. As a result, we propose +two methods: one that identifies each token as either being part of an +adversarial prompt or not, and another that estimates the probability of each +token being part of an adversarial prompt. + +
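+ A hedged sketch of per-token surprisal scoring with neighbor smoothing, in the
+spirit of the detection method above. The model name, window size, and
+threshold are illustrative assumptions rather than the paper's settings.
+
+import torch
+import torch.nn.functional as F
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+def token_surprisal_flags(text, model_name="gpt2", window=3, threshold=8.0):
+    tok = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForCausalLM.from_pretrained(model_name).eval()
+    ids = tok(text, return_tensors="pt").input_ids
+    with torch.no_grad():
+        logits = model(ids).logits
+    # Negative log-likelihood of each token given its prefix (surprisal).
+    nll = F.cross_entropy(logits[0, :-1], ids[0, 1:], reduction="none")
+    # Average over neighboring tokens to favor flagging contiguous adversarial spans.
+    smoothed = F.avg_pool1d(nll.view(1, 1, -1), kernel_size=window,
+                            stride=1, padding=window // 2).flatten()[: nll.numel()]
+    tokens = tok.convert_ids_to_tokens(ids[0, 1:])
+    return [(t, float(s), bool(s > threshold)) for t, s in zip(tokens, smoothed)]
+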
+
+
+
+
+ + ♻ ☆ Ring Attention with Blockwise Transformers for Near-Infinite Context + + +
+ Transformers have emerged as the architecture of choice for many +state-of-the-art AI models, showcasing exceptional performance across a wide +range of AI applications. However, the memory demands imposed by Transformers +limit their ability to handle long sequences, thereby posing challenges in +utilizing videos, actions, and other long-form sequences and modalities in +complex environments. We present a novel approach, Ring Attention with +Blockwise Transformers (Ring Attention), which leverages blockwise computation +of self-attention and feedforward to distribute long sequences across multiple +devices while fully overlapping the communication of key-value blocks with the +computation of blockwise attention. Our approach enables training and inference +of sequences that are up to device count times longer than those achievable by +prior memory-efficient Transformers, without resorting to approximations or +incurring additional communication and computation overheads. Extensive +experiments on language modeling and reinforcement learning tasks demonstrate +the effectiveness of our approach in allowing millions of tokens context size +and improving performance. + +
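+ The single-device building block of the approach above is blockwise attention
+with a streaming (online) softmax; a minimal sketch is given below, with the
+ring communication of key-value blocks between devices omitted. Block size and
+shapes are illustrative.
+
+import torch
+
+def blockwise_attention(q, k, v, block_size=128):
+    # q, k, v: [seq_len, dim]; iterates over key/value blocks while keeping the exact softmax.
+    scale = q.shape[-1] ** -0.5
+    out = torch.zeros_like(q)
+    running_max = torch.full((q.shape[0], 1), float("-inf"))
+    running_den = torch.zeros(q.shape[0], 1)
+    for start in range(0, k.shape[0], block_size):
+        kb, vb = k[start:start + block_size], v[start:start + block_size]
+        scores = (q @ kb.T) * scale                                   # [seq_len, block]
+        new_max = torch.maximum(running_max, scores.max(dim=-1, keepdim=True).values)
+        correction = torch.exp(running_max - new_max)
+        p = torch.exp(scores - new_max)
+        out = out * correction + p @ vb                               # rescale old numerator, add block
+        running_den = running_den * correction + p.sum(dim=-1, keepdim=True)
+        running_max = new_max
+    return out / running_den
+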
+
+ comment: Code: https://github.com/lhao499/llm_large_context +
+
+
+
+
+ + ♻ ☆ TODM: Train Once Deploy Many Efficient Supernet-Based RNN-T Compression + For On-device ASR Models ICASSP 2024 + + +
+ Automatic Speech Recognition (ASR) models need to be optimized for specific
+hardware before they can be deployed on devices. This can be done by tuning the
+model's hyperparameters or exploring variations in its architecture.
+Re-training and re-validating models after making these changes can be a
+resource-intensive task. This paper presents TODM (Train Once Deploy Many), a
+new approach to efficiently train many sizes of hardware-friendly on-device ASR
+models with comparable GPU-hours to that of a single training job. TODM
+leverages insights from prior work on Supernet, where Recurrent Neural Network
+Transducer (RNN-T) models share weights within a Supernet. It reduces layer
+sizes and widths of the Supernet to obtain subnetworks, making them smaller
+models suitable for all hardware types. We introduce a novel combination of
+three techniques to improve the outcomes of the TODM Supernet: adaptive
+dropouts, an in-place Alpha-divergence knowledge distillation, and the use of
+the ScaledAdam optimizer. We validate our approach by comparing
+Supernet-trained versus individually tuned Multi-Head State Space Model
+(MH-SSM) RNN-T using LibriSpeech. Results demonstrate that our TODM Supernet
+either matches or surpasses the performance of manually tuned models by up to
+3% relative in word error rate (WER), while efficiently keeping the cost of
+training many models at a small constant.
+
+
+ comment: Meta AI; Submitted to ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ WordArt Designer: User-Driven Artistic Typography Synthesis using Large + Language Models EMNLP 2023 + + +
+ This paper introduces WordArt Designer, a user-driven framework for artistic +typography synthesis, relying on the Large Language Model (LLM). The system +incorporates four key modules: the LLM Engine, SemTypo, StyTypo, and TexTypo +modules. 1) The LLM Engine, empowered by the LLM (e.g., GPT-3.5), interprets +user inputs and generates actionable prompts for the other modules, thereby +transforming abstract concepts into tangible designs. 2) The SemTypo module +optimizes font designs using semantic concepts, striking a balance between +artistic transformation and readability. 3) Building on the semantic layout +provided by the SemTypo module, the StyTypo module creates smooth, refined +images. 4) The TexTypo module further enhances the design's aesthetics through +texture rendering, enabling the generation of inventive textured fonts. +Notably, WordArt Designer highlights the fusion of generative AI with artistic +typography. Experience its capabilities on ModelScope: +https://www.modelscope.cn/studios/WordArt/WordArt. + +
+
+ comment: Accepted by EMNLP 2023, 10 pages, 11 figures, 1 table, the system is + at https://www.modelscope.cn/studios/WordArt/WordArt +
+
+
+
+
+ + ♻ ☆ LM-Cocktail: Resilient Tuning of Language Models via Model Merging + + +
+ Pre-trained language models are continually fine-tuned to better support
+downstream applications. However, this operation may result in significant
+performance degeneration on general tasks beyond the targeted domain. To
+overcome this problem, we propose a novel method which enables the fine-tuned
+model to stay resilient in general perspectives. Our method is conducted in the
+form of model merging (namely LM-Cocktail), where the fine-tuned language model
+is merged with the pre-trained base model or the peer models from other domains
+through weighted averaging. Despite its simplicity, LM-Cocktail is surprisingly
+effective: the resulting model is able to achieve a strong empirical
+performance across the whole scope of general tasks while preserving a superior
+capacity in its targeted domain. We conduct comprehensive experiments with the
+Llama and BGE models on popular benchmarks, including FLAN, MMLU, and MTEB,
+whose results validate the efficacy of our proposed method. The code and
+checkpoints are available at
+https://github.com/FlagOpen/FlagEmbedding/tree/master/LM_Cocktail.
+
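+ Model merging by weighted parameter averaging, as described above, reduces to
+a few lines; a minimal sketch follows. The particular weights and model choices
+are illustrative, and all merged models must share the same architecture.
+
+import torch
+
+def merge_state_dicts(state_dicts, weights):
+    assert abs(sum(weights) - 1.0) < 1e-6, "merging weights should sum to 1"
+    merged = {}
+    for name in state_dicts[0]:
+        merged[name] = sum(w * sd[name].float() for w, sd in zip(weights, state_dicts))
+    return merged
+
+# Example: 50% fine-tuned model, 30% pre-trained base, 20% a peer-domain model.
+# merged = merge_state_dicts([ft.state_dict(), base.state_dict(), peer.state_dict()], [0.5, 0.3, 0.2])
+# model.load_state_dict(merged)
+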
+
+
+
+
+ + ♻ ☆ How to Prompt LLMs for Text-to-SQL: A Study in Zero-shot, Single-domain, + and Cross-domain Settings + + +
+ Large language models (LLMs) with in-context learning have demonstrated +remarkable capability in the text-to-SQL task. Previous research has prompted +LLMs with various demonstration-retrieval strategies and intermediate reasoning +steps to enhance the performance of LLMs. However, those works often employ +varied strategies when constructing the prompt text for text-to-SQL inputs, +such as databases and demonstration examples. This leads to a lack of +comparability in both the prompt constructions and their primary contributions. +Furthermore, selecting an effective prompt construction has emerged as a +persistent problem for future research. To address this limitation, we +comprehensively investigate the impact of prompt constructions across various +settings and provide insights into prompt constructions for future text-to-SQL +studies. + +
+
+
+
+
+ + ♻ ☆ In-Context Learning Dynamics with Random Binary Sequences + + +
+ Large language models (LLMs) trained on huge corpora of text datasets +demonstrate intriguing capabilities, achieving state-of-the-art performance on +tasks they were not explicitly trained for. The precise nature of LLM +capabilities is often mysterious, and different prompts can elicit different +capabilities through in-context learning. We propose a framework that enables +us to analyze in-context learning dynamics to understand latent concepts +underlying LLMs' behavioral patterns. This provides a more nuanced +understanding than success-or-failure evaluation benchmarks, but does not +require observing internal activations as a mechanistic interpretation of +circuits would. Inspired by the cognitive science of human randomness +perception, we use random binary sequences as context and study dynamics of +in-context learning by manipulating properties of context data, such as +sequence length. In the latest GPT-3.5+ models, we find emergent abilities to +generate seemingly random numbers and learn basic formal languages, with +striking in-context learning dynamics where model outputs transition sharply +from seemingly random behaviors to deterministic repetition. + +
+
+
+
+
+ + ♻ ☆ DSI++: Updating Transformer Memory with New Documents EMNLP 2023 + + +
+ Differentiable Search Indices (DSIs) encode a corpus of documents in model +parameters and use the same model to answer user queries directly. Despite the +strong performance of DSI models, deploying them in situations where the corpus +changes over time is computationally expensive because reindexing the corpus +requires re-training the model. In this work, we introduce DSI++, a continual +learning challenge for DSI to incrementally index new documents while being +able to answer queries related to both previously and newly indexed documents. +Across different model scales and document identifier representations, we show +that continual indexing of new documents leads to considerable forgetting of +previously indexed documents. We also hypothesize and verify that the model +experiences forgetting events during training, leading to unstable learning. To +mitigate these issues, we investigate two approaches. The first focuses on +modifying the training dynamics. Flatter minima implicitly alleviate +forgetting, so we optimize for flatter loss basins and show that the model +stably memorizes more documents ($+12\%$). Next, we introduce a generative +memory to sample pseudo-queries for documents and supplement them during +continual indexing to prevent forgetting for the retrieval task. Extensive +experiments on novel continual indexing benchmarks based on Natural Questions +(NQ) and MS MARCO demonstrate that our proposed solution mitigates forgetting +significantly. Concretely, it improves the average Hits@10 by $+21.1\%$ over +competitive baselines for NQ and requires $6$ times fewer model updates +compared to re-training the DSI model for incrementally indexing five corpora +in a sequence. + +
+
+ comment: Accepted at EMNLP 2023 main conference +
+
+
+
+
+ + ♻ ☆ In-Context Demonstration Selection with Cross Entropy Difference + + +
+ Large language models (LLMs) can use in-context demonstrations to improve +performance on zero-shot tasks. However, selecting the best in-context examples +is challenging because model performance can vary widely depending on the +selected examples. We present a cross-entropy difference (CED) method for +selecting in-context demonstrations. Our method is based on the observation +that the effectiveness of in-context demonstrations negatively correlates with +the perplexity of the test example by a language model that was finetuned on +that demonstration. We utilize parameter efficient finetuning to train small +models on training data that are used for computing the cross-entropy +difference between a test example and every candidate in-context demonstration. +This metric is used to rank and select in-context demonstrations independently +for each test input. We evaluate our method on a mix-domain dataset that +combines 8 benchmarks, representing 4 text generation tasks, showing that CED +for in-context demonstration selection can improve performance for a variety of +LLMs. + +
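+ A hedged sketch of the cross-entropy difference (CED) ranking described above:
+each candidate demonstration is paired with a small model finetuned on it, and
+candidates are ranked by how much that finetuning lowers the test example's
+loss relative to the base model. The `loss_fn(model, text)` interface and the
+pairing of candidates with already-finetuned models are assumptions made for
+illustration.
+
+def ced_rank(base_model, finetuned_models, candidates, test_example, loss_fn):
+    # loss_fn(model, text) -> average token-level cross-entropy of `text` under `model`.
+    base_loss = loss_fn(base_model, test_example)
+    scored = []
+    for demo, ft_model in zip(candidates, finetuned_models):
+        ced = base_loss - loss_fn(ft_model, test_example)   # larger = demo's model fits the test input better
+        scored.append((ced, demo))
+    scored.sort(key=lambda pair: pair[0], reverse=True)
+    return [demo for _, demo in scored]
+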
+
+
+
+
+ + ♻ ☆ A Language Agent for Autonomous Driving + + +
+ Human-level driving is an ultimate goal of autonomous driving. Conventional +approaches formulate autonomous driving as a perception-prediction-planning +framework, yet their systems do not capitalize on the inherent reasoning +ability and experiential knowledge of humans. In this paper, we propose a +fundamental paradigm shift from current pipelines, exploiting Large Language +Models (LLMs) as a cognitive agent to integrate human-like intelligence into +autonomous driving systems. Our approach, termed Agent-Driver, transforms the +traditional autonomous driving pipeline by introducing a versatile tool library +accessible via function calls, a cognitive memory of common sense and +experiential knowledge for decision-making, and a reasoning engine capable of +chain-of-thought reasoning, task planning, motion planning, and +self-reflection. Powered by LLMs, our Agent-Driver is endowed with intuitive +common sense and robust reasoning capabilities, thus enabling a more nuanced, +human-like approach to autonomous driving. We evaluate our approach on the +large-scale nuScenes benchmark, and extensive experiments substantiate that our +Agent-Driver significantly outperforms the state-of-the-art driving methods by +a large margin. Our approach also demonstrates superior interpretability and +few-shot learning ability to these methods. Code will be released. + +
+
+ comment: Project Page: https://usc-gvl.github.io/Agent-Driver/ +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 150 + +
+
+
+ + ☆ Video-Bench: A Comprehensive Benchmark and Toolkit for Evaluating + Video-based Large Language Models + + +
+ Video-based large language models (Video-LLMs) have been recently introduced, +targeting both fundamental improvements in perception and comprehension, and a +diverse range of user inquiries. In pursuit of the ultimate goal of achieving +artificial general intelligence, a truly intelligent Video-LLM model should not +only see and understand the surroundings, but also possess human-level +commonsense, and make well-informed decisions for the users. To guide the +development of such a model, the establishment of a robust and comprehensive +evaluation system becomes crucial. To this end, this paper proposes +\textit{Video-Bench}, a new comprehensive benchmark along with a toolkit +specifically designed for evaluating Video-LLMs. The benchmark comprises 10 +meticulously crafted tasks, evaluating the capabilities of Video-LLMs across +three distinct levels: Video-exclusive Understanding, Prior Knowledge-based +Question-Answering, and Comprehension and Decision-making. In addition, we +introduce an automatic toolkit tailored to process model outputs for various +tasks, facilitating the calculation of metrics and generating convenient final +scores. We evaluate 8 representative Video-LLMs using \textit{Video-Bench}. The +findings reveal that current Video-LLMs still fall considerably short of +achieving human-like comprehension and analysis of real-world videos, offering +valuable insights for future research directions. The benchmark and toolkit are +available at: \url{https://github.com/PKU-YuanGroup/Video-Bench}. + +
+
+ comment: Benchmark is available at + https://github.com/PKU-YuanGroup/Video-Bench +
+
+
+
+
+ + ☆ Test-time Adaptation of Discriminative Models via Diffusion Generative + Feedback NeurIPS 2023 + + +
+ The advancements in generative modeling, particularly the advent of diffusion +models, have sparked a fundamental question: how can these models be +effectively used for discriminative tasks? In this work, we find that +generative models can be great test-time adapters for discriminative models. +Our method, Diffusion-TTA, adapts pre-trained discriminative models such as +image classifiers, segmenters and depth predictors, to each unlabelled example +in the test set using generative feedback from a diffusion model. We achieve +this by modulating the conditioning of the diffusion model using the output of +the discriminative model. We then maximize the image likelihood objective by +backpropagating the gradients to discriminative model's parameters. We show +Diffusion-TTA significantly enhances the accuracy of various large-scale +pre-trained discriminative models, such as, ImageNet classifiers, CLIP models, +image pixel labellers and image depth predictors. Diffusion-TTA outperforms +existing test-time adaptation methods, including TTT-MAE and TENT, and +particularly shines in online adaptation setups, where the discriminative model +is continually adapted to each example in the test set. We provide access to +code, results, and visualizations on our website: +https://diffusion-tta.github.io/. + +
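+ A hedged sketch of the adaptation loop described above: the classifier's
+predicted class probabilities modulate the diffusion model's conditioning, the
+denoising loss serves as a proxy for the image likelihood objective, and its
+gradient updates only the classifier's parameters. The diffusion-model
+interface (`add_noise`, `denoise`, `num_timesteps`) is an assumed placeholder,
+not the authors' actual API.
+
+import torch
+import torch.nn.functional as F
+
+def diffusion_tta_step(classifier, diffusion, image, class_embeddings, optimizer, n_steps=5):
+    # `optimizer` should be built over classifier.parameters() only.
+    for _ in range(n_steps):
+        probs = classifier(image).softmax(dim=-1)            # [1, num_classes]
+        cond = probs @ class_embeddings                      # probability-weighted conditioning vector
+        t = torch.randint(0, diffusion.num_timesteps, (1,))
+        noise = torch.randn_like(image)
+        noisy = diffusion.add_noise(image, noise, t)         # assumed helper
+        pred_noise = diffusion.denoise(noisy, t, cond)       # assumed helper
+        loss = F.mse_loss(pred_noise, noise)                 # proxy for maximizing image likelihood
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+    return classifier(image).softmax(dim=-1)
+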
+
+ comment: Accepted at NeurIPS 2023 Webpage with Code: + https://diffusion-tta.github.io/ +
+
+
+
+
+ + ☆ How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for + Vision LLMs SC + + +
+ This work focuses on the potential of Vision LLMs (VLLMs) in visual +reasoning. Different from prior studies, we shift our focus from evaluating +standard performance to introducing a comprehensive safety evaluation suite, +covering both out-of-distribution (OOD) generalization and adversarial +robustness. For the OOD evaluation, we present two novel VQA datasets, each +with one variant, designed to test model performance under challenging +conditions. In exploring adversarial robustness, we propose a straightforward +attack strategy for misleading VLLMs to produce visual-unrelated responses. +Moreover, we assess the efficacy of two jailbreaking strategies, targeting +either the vision or language component of VLLMs. Our evaluation of 21 diverse +models, ranging from open-source VLLMs to GPT-4V, yields interesting +observations: 1) Current VLLMs struggle with OOD texts but not images, unless +the visual information is limited; and 2) These VLLMs can be easily misled by +deceiving vision encoders only, and their vision-language training often +compromise safety protocols. We release this safety evaluation suite at +https://github.com/UCSC-VLAA/vllm-safety-benchmark. + +
+
+ comment: H.T., C.C., and Z.W. contribute equally. Work done during H.T. and + Z.W.'s internship at UCSC, and C.C. and Y.Z.'s internship at UNC +
+
+
+
+
+ + ☆ GART: Gaussian Articulated Template Models + + +
+ We introduce Gaussian Articulated Template Model GART, an explicit, +efficient, and expressive representation for non-rigid articulated subject +capturing and rendering from monocular videos. GART utilizes a mixture of +moving 3D Gaussians to explicitly approximate a deformable subject's geometry +and appearance. It takes advantage of a categorical template model prior (SMPL, +SMAL, etc.) with learnable forward skinning while further generalizing to more +complex non-rigid deformations with novel latent bones. GART can be +reconstructed via differentiable rendering from monocular videos in seconds or +minutes and rendered in novel poses faster than 150fps. + +
+
+ comment: 13 pages, code available at + https://www.cis.upenn.edu/~leijh/projects/gart/ +
+
+
+
+
+ + ☆ On Bringing Robots Home + + +
+ Throughout history, we have successfully integrated various machines into our +homes. Dishwashers, laundry machines, stand mixers, and robot vacuums are a few +recent examples. However, these machines excel at performing only a single task +effectively. The concept of a "generalist machine" in homes - a domestic +assistant that can adapt and learn from our needs, all while remaining +cost-effective - has long been a goal in robotics that has been steadily +pursued for decades. In this work, we initiate a large-scale effort towards +this goal by introducing Dobb-E, an affordable yet versatile general-purpose +system for learning robotic manipulation within household settings. Dobb-E can +learn a new task with only five minutes of a user showing it how to do it, +thanks to a demonstration collection tool ("The Stick") we built out of cheap +parts and iPhones. We use the Stick to collect 13 hours of data in 22 homes of +New York City, and train Home Pretrained Representations (HPR). Then, in a +novel home environment, with five minutes of demonstrations and fifteen minutes +of adapting the HPR model, we show that Dobb-E can reliably solve the task on +the Stretch, a mobile robot readily available on the market. Across roughly 30 +days of experimentation in homes of New York City and surrounding areas, we +test our system in 10 homes, with a total of 109 tasks in different +environments, and finally achieve a success rate of 81%. Beyond success +percentages, our experiments reveal a plethora of unique challenges absent or +ignored in lab robotics. These range from effects of strong shadows, to +variable demonstration quality by non-expert users. With the hope of +accelerating research on home robots, and eventually seeing robot butlers in +every home, we open-source Dobb-E software stack and models, our data, and our +hardware designs at https://dobb-e.com + +
+
+ comment: Project website and videos are available at https://dobb-e.com, + technical documentation for getting started is available at + https://docs.dobb-e.com, and code is released at + https://github.com/notmahi/dobb-e +
+
+
+
+
+ + ☆ CG-HOI: Contact-Guided 3D Human-Object Interaction Generation + + +
+ We propose CG-HOI, the first method to address the task of generating dynamic +3D human-object interactions (HOIs) from text. We model the motion of both +human and object in an interdependent fashion, as semantically rich human +motion rarely happens in isolation without any interactions. Our key insight is +that explicitly modeling contact between the human body surface and object +geometry can be used as strong proxy guidance, both during training and +inference. Using this guidance to bridge human and object motion enables +generating more realistic and physically plausible interaction sequences, where +the human body and corresponding object move in a coherent manner. Our method +first learns to model human motion, object motion, and contact in a joint +diffusion process, inter-correlated through cross-attention. We then leverage +this learned contact for guidance during inference synthesis of realistic, +coherent HOIs. Extensive evaluation shows that our joint contact-based +human-object interaction approach generates realistic and physically plausible +sequences, and we show two applications highlighting the capabilities of our +method. Conditioned on a given object trajectory, we can generate the +corresponding human motion without re-training, demonstrating strong +human-object interdependency learning. Our approach is also flexible, and can +be applied to static real-world 3D scene scans. + +
+
+ comment: Project page: https://cg-hoi.christian-diller.de Video: + https://www.youtube.com/watch?v=GNyQwTwZ15s +
+
+
+
+
+ + ☆ Animatable Gaussians: Learning Pose-dependent Gaussian Maps for + High-fidelity Human Avatar Modeling + + +
+ Modeling animatable human avatars from RGB videos is a long-standing and challenging problem. Recent works usually adopt MLP-based neural radiance fields (NeRF) to represent 3D humans, but it remains difficult for pure MLPs to regress pose-dependent garment details. To this end, we introduce Animatable Gaussians, a new avatar representation that leverages powerful 2D CNNs and 3D Gaussian splatting to create high-fidelity avatars. To associate 3D Gaussians with the animatable avatar, we learn a parametric template from the input videos, and then parameterize the template on two front & back canonical Gaussian maps where each pixel represents a 3D Gaussian. The learned template is adaptive to the wearing garments for modeling looser clothes like dresses. Such template-guided 2D parameterization enables us to employ a powerful StyleGAN-based CNN to learn the pose-dependent Gaussian maps for modeling detailed dynamic appearances. Furthermore, we introduce a pose projection strategy for better generalization given novel poses. Overall, our method can create lifelike avatars with dynamic, realistic and generalized appearances. Experiments show that our method outperforms other state-of-the-art approaches. Code: https://github.com/lizhe00/AnimatableGaussians
+
+ comment: Project page: https://animatable-gaussians.github.io/, Code: https://github.com/lizhe00/AnimatableGaussians
+
+
+
+
+ + ☆ Street TryOn: Learning In-the-Wild Virtual Try-On from Unpaired Person + Images + + +
+ Virtual try-on has become a popular research topic, but most existing methods +focus on studio images with a clean background. They can achieve plausible +results for this studio try-on setting by learning to warp a garment image to +fit a person's body from paired training data, i.e., garment images paired with +images of people wearing the same garment. Such data is often collected from +commercial websites, where each garment is demonstrated both by itself and on +several models. By contrast, it is hard to collect paired data for in-the-wild +scenes, and therefore, virtual try-on for casual images of people against +cluttered backgrounds is rarely studied. + In this work, we fill the gap in the current virtual try-on research by (1) +introducing a Street TryOn benchmark to evaluate performance on street scenes +and (2) proposing a novel method that can learn without paired data, from a set +of in-the-wild person images directly. Our method can achieve robust +performance across shop and street domains using a novel DensePose warping +correction method combined with diffusion-based inpainting controlled by pose +and semantic segmentation. Our experiments demonstrate competitive performance +for standard studio try-on tasks and SOTA performance for street try-on and +cross-domain try-on tasks. + +
+
+
+
+
+ + ☆ Interactive Autonomous Navigation with Internal State Inference and + Interactivity Estimation + + +
+ Deep reinforcement learning (DRL) provides a promising way for intelligent +agents (e.g., autonomous vehicles) to learn to navigate complex scenarios. +However, DRL with neural networks as function approximators is typically +considered a black box with little explainability and often suffers from +suboptimal performance, especially for autonomous navigation in highly +interactive multi-agent environments. To address these issues, we propose three +auxiliary tasks with spatio-temporal relational reasoning and integrate them +into the standard DRL framework, which improves the decision making performance +and provides explainable intermediate indicators. We propose to explicitly +infer the internal states (i.e., traits and intentions) of surrounding agents +(e.g., human drivers) as well as to predict their future trajectories in the +situations with and without the ego agent through counterfactual reasoning. +These auxiliary tasks provide additional supervision signals to infer the +behavior patterns of other interactive agents. Multiple variants of framework +integration strategies are compared. We also employ a spatio-temporal graph +neural network to encode relations between dynamic entities, which enhances +both internal state inference and decision making of the ego agent. Moreover, +we propose an interactivity estimation mechanism based on the difference +between predicted trajectories in these two situations, which indicates the +degree of influence of the ego agent on other agents. To validate the proposed +method, we design an intersection driving simulator based on the Intelligent +Intersection Driver Model (IIDM) that simulates vehicles and pedestrians. Our +approach achieves robust and state-of-the-art performance in terms of standard +evaluation metrics and provides explainable intermediate indicators (i.e., +internal states, and interactivity scores) for decision making. + +
+
+ comment: 18 pages, 14 figures +
+
+
+
+
+ + ☆ Self-correcting LLM-controlled Diffusion Models + + +
+ Text-to-image generation has witnessed significant progress with the advent +of diffusion models. Despite the ability to generate photorealistic images, +current text-to-image diffusion models still often struggle to accurately +interpret and follow complex input text prompts. In contrast to existing models +that aim to generate images only with their best effort, we introduce +Self-correcting LLM-controlled Diffusion (SLD). SLD is a framework that +generates an image from the input prompt, assesses its alignment with the +prompt, and performs self-corrections on the inaccuracies in the generated +image. Steered by an LLM controller, SLD turns text-to-image generation into an +iterative closed-loop process, ensuring correctness in the resulting image. SLD +is not only training-free but can also be seamlessly integrated with diffusion +models behind API access, such as DALL-E 3, to further boost the performance of +state-of-the-art diffusion models. Experimental results show that our approach +can rectify a majority of incorrect generations, particularly in generative +numeracy, attribute binding, and spatial relationships. Furthermore, by simply +adjusting the instructions to the LLM, SLD can perform image editing tasks, +bridging the gap between text-to-image generation and image editing pipelines. +We will make our code available for future research and applications. + +
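The closed-loop process described above (generate, assess with an LLM, correct, repeat) can be summarized as control flow. The sketch below is illustrative only: generate_image, llm_assess, and apply_edits are hypothetical caller-supplied callables standing in for the diffusion backend (possibly an API-only model such as DALL-E 3), the LLM controller, and the correction operations; they are not functions from the paper's codebase.

def self_correcting_generation(prompt, generate_image, llm_assess, apply_edits, max_rounds=3):
    """Closed-loop text-to-image generation in the spirit of SLD (control flow only).

    The three callables are assumptions: the image generator, the LLM controller that
    checks counts / attributes / spatial relations, and the correction operations.
    """
    image = generate_image(prompt)                      # initial best-effort generation
    for _ in range(max_rounds):
        errors = llm_assess(prompt, image)              # LLM lists mismatches with the prompt
        if not errors:
            break                                       # image already aligned with the prompt
        image = apply_edits(image, errors)              # self-correct only the flagged objects
    return image

Because the loop is training-free, swapping in a different generator or editing backend only changes the injected callables, which mirrors the abstract's claim that the framework can wrap API-access models.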
+
+ comment: 16 pages, 10 figures +
+
+
+
+
+ + ☆ ViT-Lens-2: Gateway to Omni-modal Intelligence + + +
+ Aiming to advance AI agents, large foundation models significantly improve reasoning and instruction execution, yet the current focus on vision and language neglects the potential of perceiving diverse modalities in open-world environments. However, the success of data-driven vision and language models is costly or even infeasible to reproduce for rare modalities. In this paper, we present ViT-Lens-2, which facilitates efficient omni-modal representation learning by perceiving novel modalities with a pretrained ViT and aligning them to a pre-defined space. Specifically, the modality-specific lens is tuned to project signals from any modality to an intermediate embedding space, whose embeddings are then processed by a strong ViT with pre-trained visual knowledge. The encoded representations are optimized toward aligning with the modal-independent space, pre-defined by off-the-shelf foundation models. ViT-Lens-2 provides a unified solution for representation learning of increasing modalities with two appealing advantages: (i) unlocking the great potential of pretrained ViTs for novel modalities effectively in a data-efficient regime; (ii) enabling emergent downstream capabilities through modality alignment and shared ViT parameters. We tailor ViT-Lens-2 to learn representations for 3D point clouds, depth, audio, tactile and EEG, and set new state-of-the-art results across various understanding tasks, such as zero-shot classification. By seamlessly integrating ViT-Lens-2 into Multimodal Foundation Models, we enable Any-modality to Text and Image Generation in a zero-shot manner. Code and models are available at https://github.com/TencentARC/ViT-Lens.
+
+ comment: This work is a follow-up of "ViT-Lens: Towards Omni-modal + Representations". arXiv admin note: text overlap with arXiv:2308.10185 +
+
+
+
+
+ + ☆ DiffSLVA: Harnessing Diffusion Models for Sign Language Video + Anonymization + + +
+ Since American Sign Language (ASL) has no standard written form, Deaf signers +frequently share videos in order to communicate in their native language. +However, since both hands and face convey critical linguistic information in +signed languages, sign language videos cannot preserve signer privacy. While +signers have expressed interest, for a variety of applications, in sign +language video anonymization that would effectively preserve linguistic +content, attempts to develop such technology have had limited success, given +the complexity of hand movements and facial expressions. Existing approaches +rely predominantly on precise pose estimations of the signer in video footage +and often require sign language video datasets for training. These requirements +prevent them from processing videos 'in the wild,' in part because of the +limited diversity present in current sign language video datasets. To address +these limitations, our research introduces DiffSLVA, a novel methodology that +utilizes pre-trained large-scale diffusion models for zero-shot text-guided +sign language video anonymization. We incorporate ControlNet, which leverages +low-level image features such as HED (Holistically-Nested Edge Detection) +edges, to circumvent the need for pose estimation. Additionally, we develop a +specialized module dedicated to capturing facial expressions, which are +critical for conveying essential linguistic information in signed languages. We +then combine the above methods to achieve anonymization that better preserves +the essential linguistic content of the original signer. This innovative +methodology makes possible, for the first time, sign language video +anonymization that could be used for real-world applications, which would offer +significant benefits to the Deaf and Hard-of-Hearing communities. We +demonstrate the effectiveness of our approach with a series of signer +anonymization experiments. + +
+
+ comment: Project webpage: https://github.com/Jeffery9707/DiffSLVA +
+
+
+
+
+ + ☆ Exploring Attribute Variations in Style-based GANs using Diffusion + Models + + +
+ Existing attribute editing methods treat semantic attributes as binary, resulting in a single edit per attribute. However, attributes such as eyeglasses, smiles, or hairstyles exhibit a vast range of diversity. In this work, we formulate the task of diverse attribute editing by modeling the multidimensional nature of attribute edits. This enables users to generate multiple plausible edits per attribute. We capitalize on disentangled latent spaces of pretrained GANs and train a Denoising Diffusion Probabilistic Model (DDPM) to learn the latent distribution for diverse edits. Specifically, we train DDPM over a dataset of edit latent directions obtained by embedding image pairs with a single attribute change. This leads to latent subspaces that enable diverse attribute editing. Applying diffusion in the highly compressed latent space allows us to model rich distributions of edits within limited computational resources. Through extensive qualitative and quantitative experiments conducted across a range of datasets, we demonstrate the effectiveness of our approach for diverse attribute editing. We also showcase the results of our method applied for 3D editing of various face attributes.
+
+ comment: NeurIPS Workshop on Diffusion Models 2023
+
+
+
+
+ + ☆ Relightable 3D Gaussian: Real-time Point Cloud Relighting with BRDF + Decomposition and Ray Tracing + + +
+ We present a novel differentiable point-based rendering framework for +material and lighting decomposition from multi-view images, enabling editing, +ray-tracing, and real-time relighting of the 3D point cloud. Specifically, a 3D +scene is represented as a set of relightable 3D Gaussian points, where each +point is additionally associated with a normal direction, BRDF parameters, and +incident lights from different directions. To achieve robust lighting +estimation, we further divide incident lights of each point into global and +local components, as well as view-dependent visibilities. The 3D scene is +optimized through the 3D Gaussian Splatting technique while BRDF and lighting +are decomposed by physically-based differentiable rendering. Moreover, we +introduce an innovative point-based ray-tracing approach based on the bounding +volume hierarchy for efficient visibility baking, enabling real-time rendering +and relighting of 3D Gaussian points with accurate shadow effects. Extensive +experiments demonstrate improved BRDF estimation and novel view rendering +results compared to state-of-the-art material estimation approaches. Our +framework showcases the potential to revolutionize the mesh-based graphics +pipeline with a relightable, traceable, and editable rendering pipeline solely +based on point cloud. Project +page:https://nju-3dv.github.io/projects/Relightable3DGaussian/. + +
+
+
+
+
+ + ☆ Weakly-Supervised 3D Reconstruction of Clothed Humans via Normal Maps + + +
+ We present a novel deep learning-based approach to the 3D reconstruction of clothed humans using weak supervision via 2D normal maps. Given a single RGB image or multiview images, our network infers a signed distance function (SDF) discretized on a tetrahedral mesh surrounding the body in a rest pose. Subsequently, inferred pose and camera parameters are used to generate a normal map from the SDF. A key aspect of our approach is the use of Marching Tetrahedra to (uniquely) compute a triangulated surface from the SDF on the tetrahedral mesh, facilitating straightforward differentiation (and thus backpropagation). Thus, given only ground truth normal maps (with no ground-truth volumetric information), we can train the network to produce SDF values from corresponding RGB images. Optionally, an additional multiview loss leads to improved results. We demonstrate the efficacy of our approach for both network inference and 3D reconstruction.
+
+
+
+
+ + ☆ OccWorld: Learning a 3D Occupancy World Model for Autonomous Driving + + +
+ Understanding how the 3D scene evolves is vital for making decisions in +autonomous driving. Most existing methods achieve this by predicting the +movements of object boxes, which cannot capture more fine-grained scene +information. In this paper, we explore a new framework of learning a world +model, OccWorld, in the 3D Occupancy space to simultaneously predict the +movement of the ego car and the evolution of the surrounding scenes. We propose +to learn a world model based on 3D occupancy rather than 3D bounding boxes and +segmentation maps for three reasons: 1) expressiveness. 3D occupancy can +describe the more fine-grained 3D structure of the scene; 2) efficiency. 3D +occupancy is more economical to obtain (e.g., from sparse LiDAR points). 3) +versatility. 3D occupancy can adapt to both vision and LiDAR. To facilitate the +modeling of the world evolution, we learn a reconstruction-based scene +tokenizer on the 3D occupancy to obtain discrete scene tokens to describe the +surrounding scenes. We then adopt a GPT-like spatial-temporal generative +transformer to generate subsequent scene and ego tokens to decode the future +occupancy and ego trajectory. Extensive experiments on the widely used nuScenes +benchmark demonstrate the ability of OccWorld to effectively model the +evolution of the driving scenes. OccWorld also produces competitive planning +results without using instance and map supervision. Code: +https://github.com/wzzheng/OccWorld. + +
+
+ comment: Code is available at: https://github.com/wzzheng/OccWorld +
+
+
+
+
+ + ☆ GaussianEditor: Editing 3D Gaussians Delicately with Text Instructions + + +
+ Recently, impressive results have been achieved in 3D scene editing with text +instructions based on a 2D diffusion model. However, current diffusion models +primarily generate images by predicting noise in the latent space, and the +editing is usually applied to the whole image, which makes it challenging to +perform delicate, especially localized, editing for 3D scenes. Inspired by +recent 3D Gaussian splatting, we propose a systematic framework, named +GaussianEditor, to edit 3D scenes delicately via 3D Gaussians with text +instructions. Benefiting from the explicit property of 3D Gaussians, we design +a series of techniques to achieve delicate editing. Specifically, we first +extract the region of interest (RoI) corresponding to the text instruction, +aligning it to 3D Gaussians. The Gaussian RoI is further used to control the +editing process. Our framework can achieve more delicate and precise editing of +3D scenes than previous methods while enjoying much faster training speed, i.e. +within 20 minutes on a single V100 GPU, more than twice as fast as +Instruct-NeRF2NeRF (45 minutes -- 2 hours). + +
+
+ comment: Project page: https://GaussianEditor.github.io +
+
+
+
+
+ + ☆ Automated Measurement of Vascular Calcification in Femoral + Endarterectomy Patients Using Deep Learning + + +
+ Atherosclerosis, a chronic inflammatory disease affecting the large arteries, +presents a global health risk. Accurate analysis of diagnostic images, like +computed tomographic angiograms (CTAs), is essential for staging and monitoring +the progression of atherosclerosis-related conditions, including peripheral +arterial disease (PAD). However, manual analysis of CTA images is +time-consuming and tedious. To address this limitation, we employed a deep +learning model to segment the vascular system in CTA images of PAD patients +undergoing femoral endarterectomy surgery and to measure vascular calcification +from the left renal artery to the patella. Utilizing proprietary CTA images of +27 patients undergoing femoral endarterectomy surgery provided by Prisma Health +Midlands, we developed a Deep Neural Network (DNN) model to first segment the +arterial system, starting from the descending aorta to the patella, and second, +to provide a metric of arterial calcification. Our designed DNN achieved 83.4% +average Dice accuracy in segmenting arteries from aorta to patella, advancing +the state-of-the-art by 0.8%. Furthermore, our work is the first to present a +robust statistical analysis of automated calcification measurement in the lower +extremities using deep learning, attaining a Mean Absolute Percentage Error +(MAPE) of 9.5% and a correlation coefficient of 0.978 between automated and +manual calcification scores. These findings underscore the potential of deep +learning techniques as a rapid and accurate tool for medical professionals to +assess calcification in the abdominal aorta and its branches above the patella. +The developed DNN model and related documentation in this project are available +at GitHub page at https://github.com/pip-alireza/DeepCalcScoring. + +
+
+ comment: Published in the MDPI Diagnostics journal; the code can be accessed via the GitHub link in the paper
+
+
+
+
+ ☆ Adversarial Doodles: Interpretable and Human-drawable Attacks Provide Describable Insights CVPR 2024
+ DNN-based image classification models are susceptible to adversarial attacks. Most previous adversarial attacks do not focus on the interpretability of the generated adversarial examples, and we cannot gain insights into the mechanism of the target classifier from the attacks. Therefore, we propose Adversarial Doodles, which have interpretable shapes. We optimize black Bézier curves to fool the target classifier by overlaying them onto the input image. By introducing random perspective transformation and regularizing the doodled area, we obtain compact attacks that cause misclassification even when humans replicate them by hand. Adversarial doodles provide describable and intriguing insights into the relationship between our attacks and the classifier's output. We utilize adversarial doodles to discover biases inherent in the target classifier, such as "We add two strokes on its head, a triangle onto its body, and two lines inside the triangle on a bird image. Then, the classifier misclassifies the image as a butterfly."
+
+ comment: Submitted to CVPR 2024 +
+
+
+
+
+ + ☆ Unified Batch Normalization: Identifying and Alleviating the Feature + Condensation in Batch Normalization and a Unified Framework + + +
+ Batch Normalization (BN) has become an essential technique in contemporary neural network design, enhancing training stability. Specifically, BN employs centering and scaling operations to standardize features along the batch dimension and uses an affine transformation to recover features. Although standard BN has shown its capability to improve deep neural network training and convergence, it still exhibits inherent limitations in certain cases. Most existing techniques that enhance BN consider a single or a few aspects of BN. In this paper, we first identify problems with BN from a feature perspective and show that feature condensation occurs during training when employing BN, which negatively affects test performance. To tackle this problem, we propose a two-stage unified framework called Unified Batch Normalization (UBN). In the first stage, we utilize a simple feature condensation threshold to alleviate feature condensation by preventing inappropriate statistic updates in normalization. In the second stage, we unify various normalization variants to boost each component of BN. Our experimental results reveal that UBN significantly enhances performance across different visual backbones and notably expedites network training convergence, particularly in early training stages. Notably, our method improves top-1 accuracy by about 3% on ImageNet classification with large batch sizes, showing the effectiveness of our approach in real-world scenarios.
+
+
+
+
+ + ☆ DiffAnt: Diffusion Models for Action Anticipation + + +
+ Anticipating future actions is inherently uncertain. Given an observed video +segment containing ongoing actions, multiple subsequent actions can plausibly +follow. This uncertainty becomes even larger when predicting far into the +future. However, the majority of existing action anticipation models adhere to +a deterministic approach, neglecting to account for future uncertainties. In +this work, we rethink action anticipation from a generative view, employing +diffusion models to capture different possible future actions. In this +framework, future actions are iteratively generated from standard Gaussian +noise in the latent space, conditioned on the observed video, and subsequently +transitioned into the action space. Extensive experiments on four benchmark +datasets, i.e., Breakfast, 50Salads, EpicKitchens, and EGTEA Gaze+, are +performed and the proposed method achieves superior or comparable results to +state-of-the-art methods, showing the effectiveness of a generative approach +for action anticipation. Our code and trained models will be published on +GitHub. + +
+
+
+
+
+ + ☆ Direct2.5: Diverse Text-to-3D Generation via Multi-view 2.5D Diffusion + + +
+ Recent advances in generative AI have unveiled significant potential for the +creation of 3D content. However, current methods either apply a pre-trained 2D +diffusion model with the time-consuming score distillation sampling (SDS), or a +direct 3D diffusion model trained on limited 3D data losing generation +diversity. In this work, we approach the problem by employing a multi-view 2.5D +diffusion fine-tuned from a pre-trained 2D diffusion model. The multi-view 2.5D +diffusion directly models the structural distribution of 3D data, while still +maintaining the strong generalization ability of the original 2D diffusion +model, filling the gap between 2D diffusion-based and direct 3D diffusion-based +methods for 3D content generation. During inference, multi-view normal maps are +generated using the 2.5D diffusion, and a novel differentiable rasterization +scheme is introduced to fuse the almost consistent multi-view normal maps into +a consistent 3D model. We further design a normal-conditioned multi-view image +generation module for fast appearance generation given the 3D geometry. Our +method is a one-pass diffusion process and does not require any SDS +optimization as post-processing. We demonstrate through extensive experiments +that, our direct 2.5D generation with the specially-designed fusion scheme can +achieve diverse, mode-seeking-free, and high-fidelity 3D content generation in +only 10 seconds. Project page: https://nju-3dv.github.io/projects/direct25. + +
+
+ comment: Project webpage: https://nju-3dv.github.io/projects/direct25 +
+
+
+
+
+ + ☆ Text2Loc: 3D Point Cloud Localization from Natural Language + + +
+ We tackle the problem of 3D point cloud localization based on a few natural +linguistic descriptions and introduce a novel neural network, Text2Loc, that +fully interprets the semantic relationship between points and text. Text2Loc +follows a coarse-to-fine localization pipeline: text-submap global place +recognition, followed by fine localization. In global place recognition, +relational dynamics among each textual hint are captured in a hierarchical +transformer with max-pooling (HTM), whereas a balance between positive and +negative pairs is maintained using text-submap contrastive learning. Moreover, +we propose a novel matching-free fine localization method to further refine the +location predictions, which completely removes the need for complicated +text-instance matching and is lighter, faster, and more accurate than previous +methods. Extensive experiments show that Text2Loc improves the localization +accuracy by up to $2\times$ over the state-of-the-art on the KITTI360Pose +dataset. We will make the code publicly available. + +
+
+ comment: 10 pages, 6 figures, 6 tables +
+
+
+
+
+ + ☆ FALCON: Fairness Learning via Contrastive Attention Approach to + Continual Semantic Scene Understanding in Open World + + +
+ Continual Learning in semantic scene segmentation aims to continually learn new unseen classes in dynamic environments while maintaining previously learned knowledge. Prior studies focused on modeling the catastrophic forgetting and background shift challenges in continual learning. However, fairness, another major challenge that causes unfair predictions and low performance across majority and minority classes, has yet to be adequately addressed. In addition, prior methods have yet to model unknown classes well, producing non-discriminative features among them. This paper presents a novel Fairness Learning via Contrastive Attention Approach to continual learning in semantic scene understanding. In particular, we first introduce a new Fairness Contrastive Clustering loss to address the problems of catastrophic forgetting and fairness. Then, we propose an attention-based visual grammar approach to effectively model the background shift problem and unknown classes, producing better feature representations for different unknown classes. Through our experiments, our proposed approach achieves State-of-the-Art (SOTA) performance on different continual learning settings of three standard benchmarks, i.e., ADE20K, Cityscapes, and Pascal VOC. It promotes the fairness of the continual semantic segmentation model.
+
+
+
+
+ + ☆ Efficient Pre-training for Localized Instruction Generation of Videos + + +
+ Procedural videos show step-by-step demonstrations of tasks like recipe preparation. Understanding such videos is challenging, involving the precise localization of steps and the generation of textual instructions. Manually annotating steps and writing instructions is costly, which limits the size of current datasets and hinders effective learning. Leveraging large but noisy video-transcript datasets for pre-training can boost performance, but demands significant computational resources. Furthermore, transcripts contain irrelevant content and exhibit style variation compared to instructions written by human annotators. To mitigate both issues, we propose a technique, Sieve-&-Swap, to automatically curate a smaller dataset: (i) Sieve filters irrelevant transcripts and (ii) Swap enhances the quality of the text instruction by automatically replacing the transcripts with human-written instructions from a text-only recipe dataset. The curated dataset, three orders of magnitude smaller than current web-scale datasets, enables efficient training of large-scale models with competitive performance. We complement our Sieve-&-Swap approach with a Procedure Transformer (ProcX) for end-to-end step localization and instruction generation for procedural videos. When this model is pre-trained on our curated dataset, it achieves state-of-the-art performance in zero-shot and finetuning settings on YouCook2 and Tasty, while using a fraction of the computational resources.
+
+
+
+
+ + ☆ From Pixels to Titles: Video Game Identification by Screenshots using + Convolutional Neural Networks + + +
+ This paper investigates video game identification through single screenshots, +utilizing five convolutional neural network (CNN) architectures (MobileNet, +DenseNet, EfficientNetB0, EfficientNetB2, and EfficientNetB3) across 22 home +console systems, spanning from Atari 2600 to PlayStation 5. Confirming the +hypothesis, CNNs autonomously extract image features, enabling the +identification of game titles from screenshots without additional features. +Using ImageNet pre-trained weights, EfficientNetB3 achieves the highest average +accuracy (74.51%), while DenseNet169 excels in 14 of the 22 systems. Employing +alternative initial weights from another screenshots dataset boosts accuracy +for EfficientNetB2 and EfficientNetB3, with the latter reaching a peak accuracy +of 76.36% and demonstrating reduced convergence epochs from 23.7 to 20.5 on +average. Overall, the combination of optimal architecture and weights attains +77.67% accuracy, primarily led by EfficientNetB3 in 19 systems. These findings +underscore the efficacy of CNNs in video game identification through +screenshots. + +
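The setup described above is standard transfer learning: an ImageNet-pretrained CNN whose classification head is replaced with a 22-way console classifier. A minimal torchvision sketch for the EfficientNet-B3 variant follows; the training loop, data pipeline, and hyperparameters are placeholders rather than the paper's exact configuration.

import torch.nn as nn
from torchvision import models

NUM_SYSTEMS = 22  # one class per home console system in the study

# EfficientNet-B3 initialized with ImageNet weights, one of the evaluated setups
model = models.efficientnet_b3(weights=models.EfficientNet_B3_Weights.IMAGENET1K_V1)

# replace the 1000-way ImageNet head with a 22-way console classifier
in_features = model.classifier[1].in_features
model.classifier[1] = nn.Linear(in_features, NUM_SYSTEMS)

# from here, train with a standard cross-entropy loop over screenshot/label pairs
criterion = nn.CrossEntropyLoss()

Starting instead from weights pretrained on another screenshots dataset, as the abstract reports, would only change the `weights` argument or the loaded state dict.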
+
+
+
+
+ + ☆ Tell2Design: A Dataset for Language-Guided Floor Plan Generation ACL2023 + + +
+ We consider the task of generating designs directly from natural language descriptions, and consider floor plan generation as the initial research area. Language conditional generative models have recently been very successful in generating high-quality artistic images. However, designs must satisfy different constraints that are not present in generating artistic images, particularly spatial and relational constraints. We make multiple contributions to initiate research on this task. First, we introduce a novel dataset, Tell2Design (T2D), which contains more than 80k floor plan designs associated with natural language instructions. Second, we propose a Sequence-to-Sequence model that can serve as a strong baseline for future research. Third, we benchmark this task with several text-conditional image generation models. We conclude by conducting human evaluations on the generated samples and providing an analysis of human performance. We hope our contributions will propel the research on language-guided design generation forward.
+
+ comment: Paper published in ACL2023; Area Chair Award; Best Paper Nomination +
+
+
+
+
+ + ☆ Unleashing the Power of Prompt-driven Nucleus Instance Segmentation + + +
+ Nuclear instance segmentation in histology images is crucial for a broad spectrum of clinical applications. Current prevailing nuclear instance segmentation algorithms rely on regression of nuclei contours, distance maps, watershed markers or a proxy nuclear representation of star-convex polygons. Consequently, these methods necessitate sophisticated post-processing operations to distinguish nuclei instances, which are commonly acknowledged to be error-prone and parameter-sensitive. Recently, the segment anything model (SAM) has attracted huge attention within the domain of medical image segmentation due to its impressive generalization ability and promptable property. Nevertheless, its potential for nuclear instance segmentation remains largely underexplored. In this paper, we present a novel prompt-driven framework that consists of a point prompter and a SAM for automatic nuclei instance segmentation. Specifically, the prompter learns to generate a unique point prompt for each nucleus while SAM is fine-tuned to output the corresponding mask of the cued nucleus. Furthermore, we propose to add adjacent nuclei as negative prompts to promote the model's ability to recognize overlapping nuclei. Without bells and whistles, our proposed method sets a new state-of-the-art performance on three challenging benchmarks. Our code is available at https://github.com/windygoo/PromptNucSeg.
+
+
+
+
+ + ☆ Optimal Transport Aggregation for Visual Place Recognition + + +
+ The task of Visual Place Recognition (VPR) aims to match a query image +against references from an extensive database of images from different places, +relying solely on visual cues. State-of-the-art pipelines focus on the +aggregation of features extracted from a deep backbone, in order to form a +global descriptor for each image. In this context, we introduce SALAD (Sinkhorn +Algorithm for Locally Aggregated Descriptors), which reformulates NetVLAD's +soft-assignment of local features to clusters as an optimal transport problem. +In SALAD, we consider both feature-to-cluster and cluster-to-feature relations +and we also introduce a 'dustbin' cluster, designed to selectively discard +features deemed non-informative, enhancing the overall descriptor quality. +Additionally, we leverage and fine-tune DINOv2 as a backbone, which provides +enhanced description power for the local features, and dramatically reduces the +required training time. As a result, our single-stage method not only surpasses +single-stage baselines in public VPR datasets, but also surpasses two-stage +methods that add a re-ranking with significantly higher cost. Code and models +are available at https://github.com/serizba/salad. + +
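The core idea above, recasting NetVLAD-style soft assignment as optimal transport with a 'dustbin' for non-informative features, can be illustrated with a few lines of Sinkhorn normalization. The sketch below is self-contained and schematic (random features, fixed zero dustbin score, no learned parameters) and is not the released SALAD implementation.

import torch

def sinkhorn(scores, n_iters=5):
    """Sinkhorn normalization of an (N, K+1) score matrix into a soft assignment
    whose rows and columns are approximately balanced (log-space for stability)."""
    log_p = scores.log_softmax(dim=-1)
    for _ in range(n_iters):
        log_p = log_p - log_p.logsumexp(dim=0, keepdim=True)   # normalize columns
        log_p = log_p - log_p.logsumexp(dim=1, keepdim=True)   # normalize rows
    return log_p.exp()

def aggregate(local_feats, cluster_centers):
    """Aggregate local features into a global descriptor via an optimal-transport-style
    assignment, with an extra 'dustbin' column for non-informative features."""
    scores = local_feats @ cluster_centers.t()                      # (N, K) similarity
    dustbin = torch.zeros(scores.shape[0], 1, device=scores.device) # learnable in practice
    assign = sinkhorn(torch.cat([scores, dustbin], dim=1))          # (N, K+1)
    assign = assign[:, :-1]                                         # drop features routed to the dustbin
    desc = assign.t() @ local_feats                                 # (K, D) per-cluster aggregation
    return torch.nn.functional.normalize(desc.flatten(), dim=0)    # global descriptor

# toy usage with random data standing in for DINOv2 patch features
feats = torch.randn(196, 64)
centers = torch.randn(8, 64)   # K = 8 clusters
print(aggregate(feats, centers).shape)   # torch.Size([512])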
+
+
+
+
+ + ☆ ADM-Loc: Actionness Distribution Modeling for Point-supervised Temporal + Action Localization + + +
+ This paper addresses the challenge of point-supervised temporal action +detection, in which only one frame per action instance is annotated in the +training set. Self-training aims to provide supplementary supervision for the +training process by generating pseudo-labels (action proposals) from a base +model. However, most current methods generate action proposals by applying +manually designed thresholds to action classification probabilities and +treating adjacent snippets as independent entities. As a result, these methods +struggle to generate complete action proposals, exhibit sensitivity to +fluctuations in action classification scores, and generate redundant and +overlapping action proposals. This paper proposes a novel framework termed +ADM-Loc, which stands for Actionness Distribution Modeling for point-supervised +action Localization. ADM-Loc generates action proposals by fitting a composite +distribution, comprising both Gaussian and uniform distributions, to the action +classification signals. This fitting process is tailored to each action class +present in the video and is applied separately for each action instance, +ensuring the distinctiveness of their distributions. ADM-Loc significantly +enhances the alignment between the generated action proposals and ground-truth +action instances and offers high-quality pseudo-labels for self-training. +Moreover, to model action boundary snippets, it enforces consistency in action +classification scores during training by employing Gaussian kernels, supervised +with the proposed loss functions. ADM-Loc outperforms the state-of-the-art +point-supervised methods on THUMOS14 and ActivityNet-v1.2 datasets. + +
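The central mechanism above is fitting a composite Gaussian-plus-uniform distribution to the actionness signal of each action instance and reading a proposal off the fit. Below is a small, self-contained numpy illustration of such a fit via weighted EM over snippet indices; the exact estimator, per-class handling, and boundary modeling in ADM-Loc are assumed to differ.

import numpy as np

def fit_actionness_distribution(scores, n_iters=50):
    """Fit a Gaussian + uniform mixture over time to a 1-D actionness signal, treating
    the normalized scores as per-snippet observation weights. Returns the per-snippet
    probability that the snippet belongs to the Gaussian (action) component."""
    t = np.arange(len(scores), dtype=float)
    w_obs = scores / (scores.sum() + 1e-12)          # snippet weights from actionness
    uniform_pdf = 1.0 / len(scores)
    mu, sigma, pi = (w_obs * t).sum(), len(scores) / 4.0, 0.5
    for _ in range(n_iters):
        g = pi * np.exp(-0.5 * ((t - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))
        u = (1.0 - pi) * uniform_pdf
        resp = g / (g + u + 1e-12)                   # E-step: P(action | snippet)
        norm = (w_obs * resp).sum() + 1e-12
        mu = (w_obs * resp * t).sum() / norm         # M-step (weighted by actionness)
        sigma = np.sqrt((w_obs * resp * (t - mu) ** 2).sum() / norm) + 1e-3
        pi = norm
    return resp

# toy actionness signal peaking around snippet 40
time = np.arange(100)
scores = 0.05 + 0.9 * np.exp(-0.5 * ((time - 40) / 6.0) ** 2)
resp = fit_actionness_distribution(scores)
inside = np.where(resp > 0.5)[0]
print(inside.min(), inside.max())   # approximate start / end of the generated proposal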
+
+
+
+
+ + ☆ Computer Vision for Carriers: PATRIOT + + +
+ Deck tracking performed on carriers currently involves a team of sailors +manually identifying aircraft and updating a digital user interface called the +Ouija Board. Improvements to the deck tracking process would result in +increased Sortie Generation Rates, and therefore applying automation is seen as +a critical method to improve deck tracking. However, the requirements on a +carrier ship do not allow for the installation of hardware-based location +sensing technologies like Global Positioning System (GPS) sensors. PATRIOT +(Panoramic Asset Tracking of Real-Time Information for the Ouija Tabletop) is a +research effort and proposed solution to performing deck tracking with passive +sensing and without the need for GPS sensors. PATRIOT is a prototype system +which takes existing camera feeds, calculates aircraft poses, and updates a +virtual Ouija board interface with the current status of the assets. PATRIOT +would allow for faster, more accurate, and less laborious asset tracking for +aircraft, people, and support equipment. PATRIOT is anticipated to benefit the +warfighter by reducing cognitive workload, reducing manning requirements, +collecting data to improve logistics, and enabling an automation gateway for +future efforts to improve efficiency and safety. The authors have developed and +tested algorithms to perform pose estimations of assets in real-time including +OpenPifPaf, High-Resolution Network (HRNet), HigherHRNet (HHRNet), Faster +R-CNN, and in-house developed encoder-decoder network. The software was tested +with synthetic and real-world data and was able to accurately extract the pose +of assets. Fusion, tracking, and real-world generality are planned to be +improved to ensure a successful transition to the fleet. + +
+
+ comment: 8 pages, 18 figures. Published in the Proceedings of the ASNE 2023 + Technology, Systems & Ships Symposium. Reproduced with permission from the + American Society of Naval Engineers. Distribution Statement A: Approved for + public release; distribution is unlimited, as submitted under NAVAIR Public + Release Authorization 2023-019 +
+
+
+
+
+ + ☆ LIFT OFF: LoRaWAN Installation and Fiducial Tracking Operations for the + Flightline of the Future + + +
+ Real-time situational awareness for the location of assets is critical to +ensure missions are completed efficiently and requirements are satisfied. In +many commercial settings, the application of global positioning system (GPS) +sensors is appropriate to achieve timely knowledge of the position of people +and equipment. However, GPS sensors are not appropriate for all situations due +to flight clearance and operations security concerns. LIFT OFF: LoRaWAN +Installation and Fiducial Tracking Operations for the Flightline of the Future +proposes a hybrid framework solution to achieve real-time situational awareness +for people, support equipment, and aircraft positions regardless of the +environment. This framework included a machine-vision component, which involved +setting up cameras to detect AprilTag decals that were installed on the sides +of aircraft. The framework included a geolocation sensor component, which +involved installing GPS sensors on support equipment and helmets. The framework +also included creating a long-range wide area network (LoRaWAN) to transfer +data and developing a user interface to display the data. The framework was +tested at Naval Air Station Oceana Flightline, the United States Naval Test +Pilot School, and at Naval Air Warfare Center Aircraft Division Lakehurst. LIFT +OFF successfully provided a real-time updating map of all tracked assets using +GPS sensors for people and support equipment and with visual fiducials for +aircraft. The trajectories of the assets were recorded for logistical analysis +and playback. Future follow-on work is anticipated to apply the technology to +other environments including carriers and amphibious assault ships in addition +to the flightline. + +
+
+ comment: 6 pages, 11 figures. Published in the Proceedings of the ASNE 2023 + Technology, Systems & Ships Symposium. Reproduced with permission from the + American Society of Naval Engineers. Distribution Statement A: Approved for + public release; distribution is unlimited, as submitted under NAVAIR Public + Release Authorization 2023-020 +
+
+
+
+
+ + ☆ Enhancing Perceptual Quality in Video Super-Resolution through + Temporally-Consistent Detail Synthesis using Diffusion Models + + +
+ In this paper, we address the problem of video super-resolution (VSR) using +Diffusion Models (DM), and present StableVSR. Our method significantly enhances +the perceptual quality of upscaled videos by synthesizing realistic and +temporally-consistent details. We turn a pre-trained DM for single image +super-resolution into a VSR method by introducing the Temporal Conditioning +Module (TCM). TCM uses Temporal Texture Guidance, which provides +spatially-aligned and detail-rich texture information synthesized in adjacent +frames. This guides the generative process of the current frame toward +high-quality and temporally-consistent results. We introduce a Frame-wise +Bidirectional Sampling strategy to encourage the use of information from past +to future and vice-versa. This strategy improves the perceptual quality of the +results and the temporal consistency across frames. We demonstrate the +effectiveness of StableVSR in enhancing the perceptual quality of upscaled +videos compared to existing state-of-the-art methods for VSR. The code is +available at https://github.com/claudiom4sir/StableVSR. + +
+
+
+
+
+ + ☆ MetaDefa: Meta-learning based on Domain Enhancement and Feature + Alignment for Single Domain Generalization + + +
+ Single domain generalization (SDG) based on meta-learning has emerged as an effective technique for solving the domain-shift problem. However, the inadequate match of data distributions between source and augmented domains and the difficult separation of domain-invariant features from domain-related features make it hard for SDG models to achieve strong generalization. Therefore, a novel meta-learning method based on domain enhancement and feature alignment (MetaDefa) is proposed to improve the model's generalization performance. First, background substitution and visual corruption techniques are used to generate diverse and effective augmented domains. Then, a multi-channel feature alignment module based on class activation maps and class-agnostic activation maps is designed to effectively extract adequate transferable knowledge. In this module, domain-invariant features can be fully explored by focusing on similar target regions between the source and augmented domain feature spaces and suppressing the feature representation of non-similar target regions. Extensive experiments on two publicly available datasets show that MetaDefa has significant generalization performance advantages in unknown multiple target domains.
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ☆ Data Generation for Post-OCR correction of Cyrillic handwriting + + +
+ This paper introduces a novel approach to post-Optical Character Recognition Correction (POC) for handwritten Cyrillic text, addressing a significant gap in current research methodologies. This gap is due to the lack of large text corpora that provide OCR errors for further training of language-based POC models, which are demanding in terms of corpus size. Our study primarily focuses on the development and application of a synthetic handwriting generation engine based on Bézier curves. Such an engine generates highly realistic handwritten text in any amount, which we utilize to create a substantial dataset by transforming Russian text corpora sourced from the internet. We apply a Handwritten Text Recognition (HTR) model to this dataset to identify OCR errors, forming the basis for our POC model training. The correction model is trained on a 90-symbol input context, utilizing a pre-trained T5 architecture with a seq2seq correction task. We evaluate our approach on the HWR200 and School_notebooks_RU datasets as they provide significant challenges in the HTR domain. Furthermore, POC can be used to highlight errors for teachers, evaluating student performance. This can be done simply by comparing sentences before and after correction, displaying differences in text. Our primary contribution lies in the innovative use of Bézier curves for Cyrillic text generation and subsequent error correction using a specialized POC model. We validate our approach by presenting Word Accuracy Rate (WAR) and Character Accuracy Rate (CAR) results, both with and without post-OCR correction, using real open corpora of handwritten Cyrillic text. These results, coupled with our methodology, are designed to be reproducible, paving the way for further advancements in the field of OCR and handwritten text analysis. Paper contributions can be found at https://github.com/dbrainio/CyrillicHandwritingPOC
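Since the generation engine above is built on Bézier curves, the basic primitive is sampling points along a (jittered) cubic Bézier stroke that would later be rasterized into a handwriting image. The snippet below illustrates only that primitive; control-point selection, pen-width rendering, and Cyrillic glyph layout are assumptions left out of the sketch.

import numpy as np

def cubic_bezier(p0, p1, p2, p3, n=100):
    """Sample n points along a cubic Bézier curve defined by four 2-D control points."""
    t = np.linspace(0.0, 1.0, n)[:, None]
    return ((1 - t) ** 3 * p0 + 3 * (1 - t) ** 2 * t * p1
            + 3 * (1 - t) * t ** 2 * p2 + t ** 3 * p3)

def jittered_stroke(p0, p1, p2, p3, jitter=0.5, rng=None):
    """One synthetic pen stroke: a Bézier curve with small random control-point
    perturbations so that repeated strokes differ, mimicking handwriting variation."""
    rng = rng or np.random.default_rng()
    pts = [np.asarray(p, dtype=float) + rng.normal(0.0, jitter, size=2)
           for p in (p0, p1, p2, p3)]
    return cubic_bezier(*pts)

stroke = jittered_stroke((0, 0), (10, 30), (30, -20), (40, 5))
print(stroke.shape)   # (100, 2) -- x/y coordinates to be rasterized into an image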
+
+ comment: 17 pages, 27 figures, 6 tables, 26 references +
+
+
+
+
+ + ☆ Stability-Informed Initialization of Neural Ordinary Differential + Equations + + +
+ This paper addresses the training of Neural Ordinary Differential Equations +(neural ODEs), and in particular explores the interplay between numerical +integration techniques, stability regions, step size, and initialization +techniques. It is shown how the choice of integration technique implicitly +regularizes the learned model, and how the solver's corresponding stability +region affects training and prediction performance. From this analysis, a +stability-informed parameter initialization technique is introduced. The +effectiveness of the initialization method is displayed across several learning +benchmarks and industrial applications. + +
+
+
+
+
+ + ☆ EVCap: Retrieval-Augmented Image Captioning with External Visual-Name + Memory for Open-World Comprehension + + +
+ Large language model (LLM)-based image captioning has the capability of describing objects not explicitly observed in training data; yet novel objects occur frequently, necessitating up-to-date object knowledge for open-world comprehension. Instead of relying on large amounts of data and scaling up network parameters, we introduce a highly effective retrieval-augmented image captioning method that prompts LLMs with object names retrieved from an External Visual-name memory (EVCap). We build an ever-changing object knowledge memory using objects' visuals and names, enabling us to (i) update the memory at a minimal cost and (ii) effortlessly augment LLMs with retrieved object names utilizing a lightweight and fast-to-train model. Our model, which was trained only on the COCO dataset, can be adapted to out-domain data without additional fine-tuning or retraining. Our comprehensive experiments conducted on various benchmarks and synthetic commonsense-violating data demonstrate that EVCap, comprising solely 3.97M trainable parameters, exhibits superior performance compared to other methods of equivalent model size. Notably, it achieves competitive performance against specialist SOTAs with an enormous number of parameters. Our code is available at https://jiaxuan-li.github.io/EVCap.
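The retrieval side of the method above, an external memory of object visuals paired with names that is queried to prompt the LLM, can be illustrated with a toy nearest-neighbour lookup. In the sketch below the visual encoder is replaced by random vectors, and the entries and prompt wording are invented for illustration; only the retrieve-then-prompt pattern reflects the abstract.

import numpy as np

class VisualNameMemory:
    """Toy external visual-name memory: stores (embedding, object name) pairs and
    returns the names of the top-k most similar entries for a query embedding."""

    def __init__(self):
        self.embeddings, self.names = [], []

    def add(self, embedding, name):
        v = np.asarray(embedding, dtype=float)
        self.embeddings.append(v / (np.linalg.norm(v) + 1e-12))
        self.names.append(name)

    def retrieve(self, query, k=3):
        q = np.asarray(query, dtype=float)
        q = q / (np.linalg.norm(q) + 1e-12)
        sims = np.stack(self.embeddings) @ q            # cosine similarity
        top = np.argsort(-sims)[:k]
        return [self.names[i] for i in top]

# toy usage with random "embeddings" standing in for a visual encoder's output
rng = np.random.default_rng(0)
memory = VisualNameMemory()
for name in ["cat", "skateboard", "pizza", "unicorn"]:
    memory.add(rng.normal(size=128), name)

retrieved = memory.retrieve(rng.normal(size=128), k=2)
prompt = f"Objects that may appear in the image: {', '.join(retrieved)}. Describe the image."
print(prompt)   # the retrieved names would be fed to the LLM as part of the captioning prompt

Updating the memory "at minimal cost", as the abstract puts it, amounts to appending new (embedding, name) pairs without retraining the captioner.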
+
+ comment: Project page: https://jiaxuan-li.github.io/EVCap +
+
+
+
+
+ + ☆ RO-LLaMA: Generalist LLM for Radiation Oncology via Noise Augmentation + and Consistency Regularization + + +
+ Recent advancements in Artificial Intelligence (AI) have profoundly influenced medical fields by providing tools to reduce clinical workloads. However, most AI models are constrained to execute uni-modal tasks, in stark contrast to the comprehensive approaches utilized by medical professionals. To address this, here we present RO-LLaMA, a versatile generalist large language model (LLM) tailored for the field of radiation oncology. This model seamlessly covers a wide range of the workflow of radiation oncologists, adept at various tasks such as clinical report summarization, radiation therapy plan suggestion, and plan-guided therapy target volume segmentation. In particular, to maximize the end-to-end performance, we further present a novel Consistency Embedding Fine-Tuning (CEFTune) technique, which boosts the LLM's robustness to errors in intermediate inputs while preserving its ability to handle clean inputs, and we extend this concept into an LLM-driven segmentation framework, Consistency Embedding Segmentation (CESEG). Experimental results on multi-centre cohort sets demonstrate our proposed RO-LLaMA's promising performance for diverse tasks with generalization capabilities.
+
+
+
+
+ + ☆ InterControl: Generate Human Motion Interactions by Controlling Every + Joint + + +
+ Text-conditioned human motion generation models have achieved great progress by introducing diffusion models and corresponding control signals. However, interactions between humans remain underexplored. To model interactions of an arbitrary number of humans, we define interactions as human joint pairs that are either in contact or separated, and leverage a Large Language Model (LLM) Planner to translate interaction descriptions into contact plans. Based on the contact plans, interaction generation can be achieved by spatially controllable motion generation methods that take joint contacts as spatial conditions. We present a novel approach named InterControl for flexible spatial control of every joint in every person at any time by leveraging a motion diffusion model trained only on single-person data. We incorporate a motion controlnet to generate coherent and realistic motions given sparse spatial control signals and a loss guidance module to precisely align any joint to the desired position in a classifier guidance manner via Inverse Kinematics (IK). Extensive experiments on the HumanML3D and KIT-ML datasets demonstrate its effectiveness in versatile joint control. We also collect joint contact pair data with LLMs to show InterControl's ability in human interaction generation.
+
+ comment: Generate human interactions with only a single-person motion diffusion model via LLM-generated joint contact pairs; code: https://github.com/zhenzhiwang/intercontrol
+
+
+
+
+ + ☆ JSSL: Joint Supervised and Self-supervised Learning for MRI + Reconstruction + + +
+ Magnetic Resonance Imaging represents an important diagnostic modality; +however, its inherently slow acquisition process poses challenges in obtaining +fully sampled k-space data under motion in clinical scenarios such as +abdominal, cardiac, and prostate imaging. In the absence of fully sampled +acquisitions, which can serve as ground truth data, training deep learning +algorithms in a supervised manner to predict the underlying ground truth image +becomes an impossible task. To address this limitation, self-supervised methods +have emerged as a viable alternative, leveraging available subsampled k-space +data to train deep learning networks for MRI reconstruction. Nevertheless, +these self-supervised approaches often fall short when compared to supervised +methodologies. In this paper, we introduce JSSL (Joint Supervised and +Self-supervised Learning), a novel training approach for deep learning-based +MRI reconstruction algorithms aimed at enhancing reconstruction quality in +scenarios where target dataset(s) containing fully sampled k-space measurements +are unavailable. Our proposed method operates by simultaneously training a +model in a self-supervised learning setting, using subsampled data from the +target dataset(s), and in a supervised learning manner, utilizing data from +other datasets, referred to as proxy datasets, where fully sampled k-space data +is accessible. To demonstrate the efficacy of JSSL, we utilized subsampled +prostate parallel MRI measurements as the target dataset, while employing fully +sampled brain and knee k-space acquisitions as proxy datasets. Our results +showcase a substantial improvement over conventional self-supervised training +methods, thereby underscoring the effectiveness of our joint approach. We +provide a theoretical motivation for JSSL and establish a practical +"rule-of-thumb" for selecting the most appropriate training approach for deep +MRI reconstruction. + +
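The joint objective described above combines a supervised loss on proxy datasets (where fully sampled k-space is available) with a self-supervised loss on the target dataset (where only part of the measured k-space can be held out as a supervision target). A schematic training step under an assumed model interface is sketched below; the MRI-specific operators, data splitting, and loss weighting in JSSL may differ.

import torch

def jssl_step(model, sup_batch, ssl_batch, optimizer, lam=1.0):
    """One joint supervised + self-supervised training step in the spirit of JSSL.

    sup_batch: (masked_kspace, target_image) from a *proxy* dataset where fully
               sampled ground truth is available -> supervised loss.
    ssl_batch: (input_kspace, held_out_kspace, held_out_mask) from the *target*
               dataset, where the measured k-space is split into an input part and a
               held-out part used only as a self-supervision target.
    `model` is assumed to map k-space to (reconstructed image, predicted k-space);
    this interface is an illustration, not the paper's code.
    """
    masked_kspace, target_image = sup_batch
    recon, _ = model(masked_kspace)
    sup_loss = torch.nn.functional.l1_loss(recon, target_image)

    input_kspace, held_out_kspace, held_out_mask = ssl_batch
    _, pred_kspace = model(input_kspace)
    ssl_loss = torch.nn.functional.l1_loss(pred_kspace * held_out_mask,
                                           held_out_kspace * held_out_mask)

    loss = sup_loss + lam * ssl_loss      # the shared weights see both objectives
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return sup_loss.item(), ssl_loss.item()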
+
+ comment: 26 pages, 11 figures, 6 tables +
+
+
+
+
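As a rough illustration of the joint objective described in the JSSL abstract above, here is a minimal PyTorch-style sketch of one training step. The reconstruction network, the k-space subset names, the L1 losses, and the fixed weighting are illustrative assumptions, not the authors' implementation.

```python
# Minimal sketch of a Joint Supervised and Self-Supervised Learning (JSSL) step.
# Assumptions: `recon_net` maps subsampled k-space to a full k-space prediction,
# both terms use L1, and a fixed weight `alpha` balances them.
import torch
import torch.nn.functional as F

def jssl_step(recon_net, target_batch, proxy_batch, optimizer, alpha=0.5):
    """target_batch: subsampled k-space split into a 'theta' input subset and a
    held-out 'lambda' subset (with mask); proxy_batch: subsampled input plus
    fully sampled ground-truth k-space from a proxy dataset (e.g., brain/knee)."""
    optimizer.zero_grad()

    # Self-supervised term on the target dataset (no fully sampled reference):
    # reconstruct from the 'theta' subset, penalize disagreement on the 'lambda' subset.
    pred_target = recon_net(target_batch["kspace_theta"])
    loss_self = F.l1_loss(pred_target * target_batch["mask_lambda"],
                          target_batch["kspace_lambda"])

    # Supervised term on proxy datasets with fully sampled k-space available.
    pred_proxy = recon_net(proxy_batch["kspace_subsampled"])
    loss_sup = F.l1_loss(pred_proxy, proxy_batch["kspace_full"])

    loss = alpha * loss_self + (1.0 - alpha) * loss_sup
    loss.backward()
    optimizer.step()
    return loss.item()
```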
+ + ☆ SiTH: Single-view Textured Human Reconstruction with Image-Conditioned + Diffusion + + +
+ A long-standing goal of 3D human reconstruction is to create lifelike and +fully detailed 3D humans from single images. The main challenge lies in +inferring unknown human shapes, clothing, and texture information in areas not +visible in the images. To address this, we propose SiTH, a novel pipeline that +uniquely integrates an image-conditioned diffusion model into a 3D mesh +reconstruction workflow. At the core of our method lies the decomposition of +the ill-posed single-view reconstruction problem into hallucination and +reconstruction subproblems. For the former, we employ a powerful generative +diffusion model to hallucinate back-view appearances from the input images. For the +latter, we leverage skinned body meshes as guidance to recover full-body +texture meshes from the input and back-view images. Our designs enable training +of the pipeline with only about 500 3D human scans while maintaining its +generality and robustness. Extensive experiments and user studies on two 3D +reconstruction benchmarks demonstrate the efficacy of our method in generating +realistic, fully textured 3D humans from a diverse range of unseen images. +
+
+ comment: 8 pages, 9 figures +
+
+
+
+
+ + ☆ Single-Model and Any-Modality for Video Object Tracking + + +
+ In the realm of video object tracking, auxiliary modalities such as depth, +thermal, or event data have emerged as valuable assets to complement RGB +trackers. In practice, most existing RGB trackers learn a single set of +parameters to use them across datasets and applications. However, a similar +single-model unification for multi-modality tracking presents several +challenges. These challenges stem from the inherent heterogeneity of inputs -- +each with modality-specific representations, the scarcity of multi-modal +datasets, and the absence of all the modalities at all times. In this work, we +introduce Un-Track, a Unified Tracker with a single set of parameters +for any modality. To handle any modality, our method learns their common latent +space through low-rank factorization and reconstruction techniques. More +importantly, we use only the RGB-X pairs to learn the common latent space. This +unique shared representation seamlessly binds all modalities together, enabling +effective unification and accommodating any missing modality, all within a +single transformer-based architecture and without the need for +modality-specific fine-tuning. Our Un-Track achieves a +8.1 absolute F-score +gain on the DepthTrack dataset by introducing only +2.14 (over 21.50) GFLOPs +and +6.6M (over 93M) parameters, through a simple yet efficient prompting +strategy. Extensive comparisons on five benchmark datasets with different +modalities show that Un-Track surpasses both SOTA unified trackers and +modality-specific finetuned counterparts, validating our effectiveness and +practicality. +
+
+
+
+
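To make the low-rank binding idea from the Un-Track abstract concrete, here is a minimal sketch of projecting an auxiliary modality into a shared latent space through a rank-constrained bottleneck and aligning it with RGB features. Layer sizes, token shapes, and the MSE alignment objective are assumptions for illustration only.

```python
# Minimal sketch: bind an auxiliary modality (depth/thermal/event) to a shared
# latent space via low-rank factorization and train it to align with RGB features.
import torch
import torch.nn as nn
import torch.nn.functional as F

class LowRankBinder(nn.Module):
    """Rank-r bottleneck that maps modality-specific tokens into the shared space."""
    def __init__(self, dim=256, rank=8):
        super().__init__()
        self.down = nn.Linear(dim, rank, bias=False)   # low-rank factor U
        self.up = nn.Linear(rank, dim, bias=False)     # low-rank factor V
    def forward(self, x_feat):
        return self.up(self.down(x_feat))

binder = LowRankBinder()
rgb_feat = torch.randn(4, 196, 256)   # RGB tokens from the (frozen) base tracker
x_feat = torch.randn(4, 196, 256)     # auxiliary modality tokens

# Align the auxiliary modality with RGB in the shared space; at test time a
# missing modality can simply be dropped because both live in one representation.
loss = F.mse_loss(binder(x_feat), rgb_feat)
loss.backward()
```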
+ + ☆ Cell Maps Representation For Lung Adenocarcinoma Growth Patterns + Classification In Whole Slide Images + + +
+ Lung adenocarcinoma is a morphologically heterogeneous disease, characterized +by five primary histologic growth patterns. The quantity of these patterns can +be related to tumor behavior and has a significant impact on patient prognosis. +In this work, we propose a novel machine learning pipeline capable of +classifying tissue tiles into one of the five patterns or as non-tumor, with an +Area Under the Receiver Operating Characteristic Curve (AUCROC) score of 0.97. +Our model's strength lies in its comprehensive consideration of cellular +spatial patterns, where it first generates cell maps from Hematoxylin and Eosin +(H&E) whole slide images (WSIs), which are then fed into a convolutional neural +network classification model. Exploiting these cell maps provides the model +with robust generalizability to new data, achieving approximately 30% higher +accuracy on unseen test-sets compared to current state of the art approaches. +The insights derived from our model can be used to predict prognosis, enhancing +patient outcomes. + +
+
+
+
+
+ + ☆ Learning with Noisy Low-Cost MOS for Image Quality Assessment via + Dual-Bias Calibration + + +
+ Learning based image quality assessment (IQA) models have obtained impressive +performance with the help of reliable subjective quality labels, where mean +opinion score (MOS) is the most popular choice. However, in view of the +subjective bias of individual annotators, the labor-abundant MOS (LA-MOS) +typically requires a large collection of opinion scores from multiple +annotators for each image, which significantly increases the learning cost. In +this paper, we aim to learn robust IQA models from low-cost MOS (LC-MOS), which +only requires very few opinion scores or even a single opinion score for each +image. More specifically, we consider the LC-MOS as the noisy observation of +LA-MOS and enforce the IQA model learned from LC-MOS to approach the unbiased +estimation of LA-MOS. In this way, we represent the subjective bias between +LC-MOS and LA-MOS, and the model bias between IQA predictions learned from +LC-MOS and LA-MOS (i.e., dual-bias) as two latent variables with unknown +parameters. By means of the expectation-maximization based alternating +optimization, we can jointly estimate the parameters of the dual-bias, which +suppresses the misleading of LC-MOS via a gated dual-bias calibration (GDBC) +module. To the best of our knowledge, this is the first exploration of robust +IQA model learning from noisy low-cost labels. Theoretical analysis and +extensive experiments on four popular IQA datasets show that the proposed +method is robust toward different bias rates and annotation numbers and +significantly outperforms the other learning based IQA models when only LC-MOS +is available. Furthermore, we also achieve comparable performance with respect +to the other models learned with LA-MOS. + +
+
+
+
+
+ + ☆ Learning Disentangled Identifiers for Action-Customized Text-to-Image + Generation + + +
+ This study focuses on a novel task in text-to-image (T2I) generation, namely +action customization. The objective of this task is to learn the co-existing +action from limited data and generalize it to unseen humans or even animals. +Experimental results show that existing subject-driven customization methods +fail to learn the representative characteristics of actions and struggle in +decoupling actions from context features, including appearance. To overcome the +preference for low-level features and the entanglement of high-level features, +we propose an inversion-based method Action-Disentangled Identifier (ADI) to +learn action-specific identifiers from the exemplar images. ADI first expands +the semantic conditioning space by introducing layer-wise identifier tokens, +thereby increasing the representational richness while distributing the +inversion across different features. Then, to block the inversion of +action-agnostic features, ADI extracts the gradient invariance from the +constructed sample triples and masks the updates of irrelevant channels. To +comprehensively evaluate the task, we present an ActionBench that includes a +variety of actions, each accompanied by meticulously selected samples. Both +quantitative and qualitative results show that our ADI outperforms existing +baselines in action-customized T2I generation. + +
+
+
+
+
+ + ☆ Syn3DWound: A Synthetic Dataset for 3D Wound Bed Analysis + + +
+ Wound management poses a significant challenge, particularly for bedridden +patients and the elderly. Accurate diagnostic and healing monitoring can +significantly benefit from modern image analysis, providing accurate and +precise measurements of wounds. Despite several existing techniques, the +shortage of expansive and diverse training datasets remains a significant +obstacle to constructing machine learning-based frameworks. This paper +introduces Syn3DWound, an open-source dataset of high-fidelity simulated wounds +with 2D and 3D annotations. We propose baseline methods and a benchmarking +framework for automated 3D morphometry analysis and 2D/3D wound segmentation. + +
+
+
+
+
+ + ☆ A-JEPA: Joint-Embedding Predictive Architecture Can Listen + + +
+ This paper shows that the masked-modeling principle driving the success of +large foundational vision models can be effectively applied to audio by making +predictions in a latent space. We introduce the Audio-based Joint-Embedding +Predictive Architecture (A-JEPA), a simple extension method for self-supervised +learning from the audio spectrum. Following the design of I-JEPA, our A-JEPA +encodes visible audio spectrogram patches with a curriculum masking strategy +via a context encoder, and predicts the representations of regions sampled at +well-designed locations. The target representations of those regions are +extracted by an exponential moving average of the context encoder, i.e., the +target encoder, on the whole spectrogram. We find it beneficial to shift from +random block masking to time-frequency-aware masking in a curriculum manner, +considering the strong local correlations in time and frequency in +audio spectrograms. To enhance contextual semantic understanding and +robustness, we fine-tune the encoder with a regularized masking on target +datasets, instead of input dropping or zeroing. Empirically, when built on the +Vision Transformer architecture, A-JEPA is highly scalable and sets +new state-of-the-art performance on multiple audio and speech classification +tasks, outperforming other recent models that use externally supervised +pre-training. +
+
+
+
+
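The context/target encoder pair mentioned in the A-JEPA abstract above follows the usual JEPA recipe, where the target encoder is an exponential moving average (EMA) of the context encoder. Below is a minimal sketch of that update; the momentum value and the encoder definition are assumptions, not the paper's settings.

```python
# Minimal sketch of a JEPA-style context/target encoder pair: the target encoder
# is never trained directly, it tracks the context encoder by EMA.
import copy
import torch

def make_target_encoder(context_encoder):
    target = copy.deepcopy(context_encoder)
    for p in target.parameters():
        p.requires_grad = False            # target encoder receives no gradients
    return target

@torch.no_grad()
def ema_update(context_encoder, target_encoder, momentum=0.996):
    # p_tgt <- momentum * p_tgt + (1 - momentum) * p_ctx
    for p_ctx, p_tgt in zip(context_encoder.parameters(), target_encoder.parameters()):
        p_tgt.mul_(momentum).add_(p_ctx, alpha=1.0 - momentum)
```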
+ + ☆ FlowZero: Zero-Shot Text-to-Video Synthesis with LLM-Driven Dynamic + Scene Syntax + + +
+ Text-to-video (T2V) generation is a rapidly growing research area that aims +to translate the scenes, objects, and actions within complex video text into a +sequence of coherent visual frames. We present FlowZero, a novel framework that +combines Large Language Models (LLMs) with image diffusion models to generate +temporally-coherent videos. FlowZero uses LLMs to understand complex +spatio-temporal dynamics from text, where LLMs can generate a comprehensive +dynamic scene syntax (DSS) containing scene descriptions, object layouts, and +background motion patterns. These elements in DSS are then used to guide the +image diffusion model for video generation with smooth object motions and +frame-to-frame coherence. Moreover, FlowZero incorporates an iterative +self-refinement process, enhancing the alignment between the spatio-temporal +layouts and the textual prompts for the videos. To enhance global coherence, we +propose enriching the initial noise of each frame with motion dynamics to +control the background movement and camera motion adaptively. By using +spatio-temporal syntaxes to guide the diffusion process, FlowZero achieves +improvement in zero-shot video synthesis, generating coherent videos with vivid +motion. + +
+
+ comment: Project page: https://flowzero-video.github.io +
+
+
+
+
+ + ☆ C-SAW: Self-Supervised Prompt Learning for Image Generalization in + Remote Sensing + + +
+ We focus on domain and class generalization problems in analyzing optical +remote sensing images, using the large-scale pre-trained vision-language model +(VLM), CLIP. While contrastively trained VLMs show impressive zero-shot +generalization performance, their effectiveness is limited when dealing with +diverse domains during training and testing. Existing prompt learning +techniques overlook the importance of incorporating domain and content +information into the prompts, which results in a drop in performance while +dealing with such multi-domain data. To address these challenges, we propose a +solution that ensures domain-invariant prompt learning while enhancing the +expressiveness of visual features. We observe that CLIP's vision encoder +struggles to identify contextual image information, particularly when image +patches are jumbled up. This issue is especially severe in optical remote +sensing images, where land-cover classes exhibit well-defined contextual +appearances. To this end, we introduce C-SAW, a method that complements CLIP +with a self-supervised loss in the visual space and a novel prompt learning +technique that emphasizes both visual domain and content-specific features. We +keep the CLIP backbone frozen and introduce a small set of projectors for both +the CLIP encoders to train C-SAW contrastively. Experimental results +demonstrate the superiority of C-SAW across multiple remote sensing benchmarks +and different generalization tasks. + +
+
+ comment: Accepted in ACM ICVGIP 2023 +
+
+
+
+
+ + ☆ PIPE : Parallelized Inference Through Post-Training Quantization + Ensembling of Residual Expansions + + +
+ Deep neural networks (DNNs) are ubiquitous in computer vision and natural +language processing, but suffer from high inference cost. This problem can be +addressed by quantization, which consists in converting floating-point +operations into a lower bit-width format. With growing concerns over privacy +rights, we focus our efforts on data-free methods. However, such techniques +suffer from a lack of adaptability to the target devices, as hardware +typically only supports specific bit widths. Thus, to adapt to a variety of +devices, a quantization method should be flexible enough to find good accuracy +vs. speed trade-offs for every bit width and target device. To achieve this, +we propose PIPE, a quantization method that leverages residual error expansion, +along with group sparsity and an ensemble approximation for better +parallelization. PIPE is backed by strong theoretical guarantees and +achieves superior performance on every benchmarked application (from vision to +NLP tasks), architecture (ConvNets, transformers) and bit width (from int8 to +ternary quantization). +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2203.14645 +
+
+
+
+
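Residual error expansion, as named in the PIPE abstract above, can be illustrated with a few lines of NumPy: quantize the weights, then quantize the leftover error, and sum the terms at inference. The uniform quantizer and the number of expansion orders here are illustrative assumptions, not the paper's exact scheme.

```python
# Minimal sketch of residual error expansion for data-free quantization.
import numpy as np

def uniform_quantize(w, n_bits=8):
    scale = np.max(np.abs(w)) / (2 ** (n_bits - 1) - 1) + 1e-12
    return np.round(w / scale) * scale

def residual_expansion(w, n_bits=8, orders=2):
    """Return quantized terms whose sum approximates w increasingly well."""
    terms, residual = [], w.copy()
    for _ in range(orders):
        q = uniform_quantize(residual, n_bits)
        terms.append(q)
        residual = residual - q            # error left for the next expansion order
    return terms

w = np.random.randn(64, 64).astype(np.float32)
terms = residual_expansion(w, n_bits=4, orders=3)
approx = sum(terms)
print("max abs error:", np.max(np.abs(w - approx)))   # shrinks as orders grow
```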
+ + ☆ SOAC: Spatio-Temporal Overlap-Aware Multi-Sensor Calibration using + Neural Radiance Fields + + +
+ In rapidly-evolving domains such as autonomous driving, the use of multiple +sensors with different modalities is crucial to ensure high operational +precision and stability. To correctly exploit the provided information by each +sensor in a single common frame, it is essential for these sensors to be +accurately calibrated. In this paper, we leverage the ability of Neural +Radiance Fields (NeRF) to represent different sensors modalities in a common +volumetric representation to achieve robust and accurate spatio-temporal sensor +calibration. By designing a partitioning approach based on the visible part of +the scene for each sensor, we formulate the calibration problem using only the +overlapping areas. This strategy results in a more robust and accurate +calibration that is less prone to failure. We demonstrate that our approach +works on outdoor urban scenes by validating it on multiple established driving +datasets. Results show that our method is able to get better accuracy and +robustness compared to existing methods. + +
+
+ comment: Paper + Supplementary, under review +
+
+
+
+
+ + ☆ Relationship between Model Compression and Adversarial Robustness: A + Review of Current Evidence + + +
+ Increasing the model capacity is a known approach to enhance the adversarial +robustness of deep learning networks. On the other hand, various model +compression techniques, including pruning and quantization, can reduce the size +of the network while preserving its accuracy. Several recent studies have +addressed the relationship between model compression and adversarial +robustness, while some experiments have reported contradictory results. This +work summarizes available evidence and discusses possible explanations for the +observed effects. + +
+
+ comment: Accepted for publication at SSCI 2023 +
+
+
+
+
+ + ☆ Stable Segment Anything Model + + +
+ The Segment Anything Model (SAM) achieves remarkable promptable segmentation +given high-quality prompts which, however, often require good skills to +specify. To make SAM robust to casual prompts, this paper presents the first +comprehensive analysis on SAM's segmentation stability across a diverse +spectrum of prompt qualities, notably imprecise bounding boxes and insufficient +points. Our key finding reveals that given such low-quality prompts, SAM's mask +decoder tends to activate image features that are biased towards the background +or confined to specific object parts. To mitigate this issue, our key idea +consists of adjusting the sampling locations of image features using learnable +deformable offsets, while the original SAM model architecture and weights +remain unchanged. Consequently, our deformable sampling plugin (DSP) enables +SAM to adaptively shift attention to the prompted target regions in a +data-driven manner, facilitated by our effective robust training strategy +(RTS). During inference, a dynamic routing plugin (DRP) is proposed that toggles +SAM between the deformable and regular grid sampling modes, conditioned on the +input prompt quality. Thus, our solution, termed Stable-SAM, is the first of its kind +to focus solely on adjusting feature sampling locations, which offers several +advantages: 1) improved segmentation stability across a wide range of +prompt qualities, while 2) retaining SAM's powerful promptable segmentation +efficiency and generality, with 3) minimal learnable parameters (0.08 M) and +fast adaptation (by 1 training epoch). Extensive experiments across multiple +datasets validate the effectiveness and advantages of our approach, +underscoring Stable-SAM as a more robust solution for segmenting anything. +Codes will be released upon acceptance. +
+
+ comment: Codes will be released upon acceptance +
+
+
+
+
+ + ☆ Check, Locate, Rectify: A Training-Free Layout Calibration System for + Text-to-Image Generation + + +
+ Diffusion models have recently achieved remarkable progress in generating +realistic images. However, challenges remain in accurately understanding and +synthesizing the layout requirements in the textual prompts. To align the +generated image with layout instructions, we present a training-free layout +calibration system SimM that intervenes in the generative process on the fly +during inference time. Specifically, following a "check-locate-rectify" +pipeline, the system first analyses the prompt to generate the target layout +and compares it with the intermediate outputs to automatically detect errors. +Then, by moving the located activations and making intra- and inter-map +adjustments, the rectification process can be performed with negligible +computational overhead. To evaluate SimM over a range of layout requirements, +we present a benchmark SimMBench that compensates for the lack of superlative +spatial relations in existing datasets. And both quantitative and qualitative +results demonstrate the effectiveness of the proposed SimM in calibrating the +layout inconsistencies. + +
+
+
+
+
+ + ☆ Side4Video: Spatial-Temporal Side Network for Memory-Efficient + Image-to-Video Transfer Learning + + +
+ Large pre-trained vision models achieve impressive success in computer +vision. However, fully fine-tuning large models for downstream tasks, +particularly in video understanding, can be prohibitively computationally +expensive. Recent studies turn their focus towards efficient image-to-video +transfer learning. Nevertheless, existing efficient fine-tuning methods lack +attention to training memory usage and exploration of transferring a larger +model to the video domain. In this paper, we present a novel Spatial-Temporal +Side Network for memory-efficient fine-tuning of large image models for video +understanding, named Side4Video. Specifically, we introduce a lightweight +spatial-temporal side network attached to the frozen vision model, which avoids +backpropagation through the heavy pre-trained model and utilizes +multi-level spatial features from the original image model. This extremely +memory-efficient architecture enables our method to reduce memory usage by 75% +compared to previous adapter-based methods. In this way, we can transfer a huge ViT-E +(4.4B), which is 14x larger than ViT-L (304M), to video understanding tasks. Our +approach achieves remarkable performance on various video datasets across +unimodal and cross-modal tasks (i.e., action recognition and text-video +retrieval), especially in Something-Something V1&V2 (67.3% & 74.6%), +Kinetics-400 (88.6%), MSR-VTT (52.3%), MSVD (56.1%) and VATEX (68.8%). We +release our code at https://github.com/HJYao00/Side4Video. +
+
+ comment: Technical report +
+
+
+
+
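The memory-saving trick described in the Side4Video abstract, keeping the backbone frozen and out of the backward graph while a small side branch learns on its intermediate features, can be sketched as follows. The side-network design, feature dimensions, and pooling are assumptions, not the released architecture.

```python
# Minimal sketch of a side network on a frozen image backbone: intermediate
# features are detached before entering the trainable branch, so no gradients
# (or activation memory) are kept for the heavy pre-trained model.
import torch
import torch.nn as nn

class SideNetwork(nn.Module):
    def __init__(self, dim=768, side_dim=128, num_stages=4, num_classes=400):
        super().__init__()
        self.proj = nn.ModuleList([nn.Linear(dim, side_dim) for _ in range(num_stages)])
        self.blocks = nn.ModuleList([
            nn.Sequential(nn.LayerNorm(side_dim), nn.Linear(side_dim, side_dim), nn.GELU())
            for _ in range(num_stages)])
        self.head = nn.Linear(side_dim, num_classes)

    def forward(self, multi_level_feats):
        # multi_level_feats: list of (B, tokens, dim) tensors from the frozen backbone
        side = 0
        for feat, proj, block in zip(multi_level_feats, self.proj, self.blocks):
            side = block(side + proj(feat.detach()))   # detach keeps the backbone out of the graph
        return self.head(side.mean(dim=1))             # pool over tokens, then classify

side_net = SideNetwork()
feats = [torch.randn(2, 196, 768) for _ in range(4)]   # stand-in backbone features
logits = side_net(feats)                                # -> shape (2, 400)
```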
+ + ☆ Towards Vision Enhancing LLMs: Empowering Multimodal Knowledge Storage + and Sharing in LLMs + + +
+ Recent advancements in multimodal large language models (MLLMs) have achieved +significant multimodal generation capabilities, akin to GPT-4. These models +predominantly map visual information into the language representation space, +leveraging the vast knowledge and powerful text generation abilities of LLMs to +produce multimodal instruction-following responses. We term this method +LLMs for Vision because it employs LLMs for visual-language +understanding, yet we observe that these MLLMs neglect the potential of harnessing +visual knowledge to enhance the overall capabilities of LLMs, which could be +regarded as Vision Enhancing LLMs. In this paper, we propose an approach called +MKS2, aimed at enhancing LLMs through empowering Multimodal Knowledge Storage +and Sharing in LLMs. Specifically, we introduce the Modular Visual Memory, a +component integrated into the internal blocks of LLMs, designed to store +open-world visual information efficiently. Additionally, we present a soft +Mixtures-of-Multimodal Experts architecture in LLMs to invoke multimodal +knowledge collaboration during generation. Our comprehensive experiments +demonstrate that MKS2 substantially augments the reasoning capabilities of LLMs +in contexts necessitating physical or commonsense knowledge. It also delivers +competitive results on multimodal benchmarks. +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ PyNanospacing: TEM image processing tool for strain analysis and + visualization + + +
+ The diverse spectrum of material characteristics including band gap, +mechanical moduli, color, phonon and electronic density of states, along with +catalytic and surface properties are intricately intertwined with the atomic +structure and the corresponding interatomic bond-lengths. This interconnection +extends to the manifestation of interplanar spacings within a crystalline +lattice. Analysis of these interplanar spacings and the comprehension of any +deviations, whether it be lattice compression or expansion, commonly referred +to as strain, hold paramount significance in unraveling various unknowns within +the field. Transmission Electron Microscopy (TEM) is widely used to capture +atomic-scale ordering, facilitating direct investigation of interplanar +spacings. However, creating critical contour maps for visualizing and +interpreting lattice stresses in TEM images remains a challenging task. Here we +developed a Python code for TEM image processing that can handle a wide range +of materials including nanoparticles, 2D materials, pure crystals and solid +solutions. This algorithm converts local differences in interplanar spacings +into contour maps allowing for a visual representation of lattice expansion and +compression. The tool is very generic and can significantly aid in analyzing +material properties using TEM images, allowing for a more in-depth exploration +of the underlying science behind strain engineering via strain contour maps at +the atomic level. + +
+
+ comment: Preprint, 13 pages, 9 figures +
+
+
+
+
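The core output of a tool like the one described in the PyNanospacing abstract, a contour map of lattice expansion and compression, can be reproduced in a few lines once a map of local interplanar spacings is available. The synthetic spacing field and reference spacing below are illustrative assumptions, not the tool's own code.

```python
# Minimal sketch: convert local interplanar spacings into a strain contour map,
# where strain is the relative deviation from a reference spacing.
import numpy as np
import matplotlib.pyplot as plt

d_ref = 0.235                                   # reference interplanar spacing (nm), assumed
x, y = np.meshgrid(np.linspace(0, 1, 200), np.linspace(0, 1, 200))
# Synthetic spacing field with a localized 2% expansion standing in for measured data.
d_local = d_ref * (1.0 + 0.02 * np.exp(-((x - 0.5) ** 2 + (y - 0.5) ** 2) / 0.02))

strain = (d_local - d_ref) / d_ref * 100.0      # percent: expansion (+) / compression (-)

plt.contourf(x, y, strain, levels=21, cmap="RdBu_r")
plt.colorbar(label="strain (%)")
plt.title("Local lattice strain from interplanar spacings")
plt.savefig("strain_contour.png", dpi=200)
```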
+ + ☆ One More Step: A Versatile Plug-and-Play Module for Rectifying Diffusion + Schedule Flaws and Enhancing Low-Frequency Controls + + +
+ It is well known that many open-released foundational diffusion models have +difficulty in generating images that substantially depart from average +brightness, despite such images being present in the training data. This is due +to an inconsistency: while denoising starts from pure Gaussian noise during +inference, the training noise schedule retains residual data even in the final +timestep distribution, due to difficulties in numerical conditioning in +mainstream formulation, leading to unintended bias during inference. To +mitigate this issue, certain $\epsilon$-prediction models are combined with an +ad-hoc offset-noise methodology. In parallel, some contemporary models have +adopted zero-terminal SNR noise schedules together with +$\mathbf{v}$-prediction, which necessitate major alterations to pre-trained +models. However, such changes risk destabilizing a large multitude of +community-driven applications anchored on these pre-trained models. In light of +this, our investigation revisits the fundamental causes, leading to our +proposal of an innovative and principled remedy, called One More Step (OMS). By +integrating a compact network and incorporating an additional simple yet +effective step during inference, OMS elevates image fidelity and harmonizes the +dichotomy between training and inference, while preserving original model +parameters. Once trained, various pre-trained diffusion models with the same +latent domain can share the same OMS module. + +
+
+ comment: Project Page: https://jabir-zheng.github.io/OneMoreStep/, Demo Page: + https://huggingface.co/spaces/h1t/oms_sdxl_lcm +
+
+
+
+
+ + ☆ Machine Learning-Based Jamun Leaf Disease Detection: A Comprehensive + Review + + +
+ Jamun leaf diseases pose a significant threat to agricultural productivity, +negatively impacting both yield and quality in the jamun industry. The advent +of machine learning has opened up new avenues for tackling these diseases +effectively. Early detection and diagnosis are essential for successful crop +management. While no automated systems have yet been developed specifically for +jamun leaf disease detection, various automated systems have been implemented +for similar types of disease detection using image processing techniques. This +paper presents a comprehensive review of machine learning methodologies +employed for diagnosing plant leaf diseases through image classification, which +can be adapted for jamun leaf disease detection. It meticulously assesses the +strengths and limitations of various Vision Transformer models, including +Transfer learning model and vision transformer (TLMViT), SLViT, SE-ViT, +IterationViT, Tiny-LeViT, IEM-ViT, GreenViT, and PMViT. Additionally, the paper +reviews models such as Dense Convolutional Network (DenseNet), Residual Neural +Network (ResNet)-50V2, EfficientNet, Ensemble model, Convolutional Neural +Network (CNN), and Locally Reversible Transformer. These machine-learning +models have been evaluated on various datasets, demonstrating their real-world +applicability. This review not only sheds light on current advancements in the +field but also provides valuable insights for future research directions in +machine learning-based jamun leaf disease detection and classification. + +
+
+
+
+
+ + ☆ Optimization of Image Processing Algorithms for Character Recognition in + Cultural Typewritten Documents + + +
+ Linked Data is used in various fields as a new way of structuring and +connecting data. Cultural heritage institutions have been using linked data to +improve archival descriptions and facilitate the discovery of information. Most +archival records have digital representations of physical artifacts in the form +of scanned images that are non-machine-readable. Optical Character Recognition +(OCR) recognizes text in images and translates it into machine-encoded text. +This paper evaluates the impact of image processing methods and parameter +tuning in OCR applied to typewritten cultural heritage documents. The approach +uses a multi-objective problem formulation to minimize Levenshtein edit +distance and maximize the number of words correctly identified with a +non-dominated sorting genetic algorithm (NSGA-II) to tune the methods' +parameters. Evaluation results show that parameterization by digital +representation typology benefits the performance of image pre-processing +algorithms in OCR. Furthermore, our findings suggest that employing image +pre-processing algorithms in OCR might be more suitable for typologies where +the text recognition task without pre-processing does not produce good results. +In particular, Adaptive Thresholding, Bilateral Filter, and Opening are the +best-performing algorithms for the theatre plays' covers, letters, and overall +dataset, respectively, and should be applied before OCR to improve its +performance. + +
+
+ comment: 25 pages, 4 figures +
+
+
+
+
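The OCR tuning paper above optimizes two objectives per parameter vector: minimize the Levenshtein edit distance and maximize the number of correctly recognized words. A minimal sketch of that evaluation is shown below; the OCR call is abstracted behind a user-supplied function, the word-matching rule is simplified, and any NSGA-II implementation (e.g. pymoo) could consume `evaluate` as its objective function. All names here are assumptions.

```python
# Minimal sketch of the two objectives used when tuning OCR pre-processing parameters.
def levenshtein(a: str, b: str) -> int:
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                 # deletion
                           cur[j - 1] + 1,              # insertion
                           prev[j - 1] + (ca != cb)))   # substitution
        prev = cur
    return prev[-1]

def evaluate(params, run_ocr, image, ground_truth):
    """Return two objectives (both minimized) for one parameter vector.
    `run_ocr(image, params)` is assumed to pre-process with `params` and return text."""
    text = run_ocr(image, params)
    edit_dist = levenshtein(text, ground_truth)
    words_ok = len(set(text.split()) & set(ground_truth.split()))
    return edit_dist, -words_ok                         # negate so the optimizer minimizes both
```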
+ + ☆ GPT4Vis: What Can GPT-4 Do for Zero-shot Visual Recognition? + + +
+ This paper does not present a novel method. Instead, it delves into an +essential, yet must-know baseline in light of the latest advancements in +Generative Artificial Intelligence (GenAI): the utilization of GPT-4 for visual +understanding. Our study centers on the evaluation of GPT-4's linguistic and +visual capabilities in zero-shot visual recognition tasks. Specifically, we +explore the potential of its generated rich textual descriptions across various +categories to enhance recognition performance without any training. +Additionally, we evaluate its visual proficiency in directly recognizing +diverse visual content. To achieve this, we conduct an extensive series of +experiments, systematically quantifying the performance of GPT-4 across three +modalities: images, videos, and point clouds. This comprehensive evaluation +encompasses a total of 16 widely recognized benchmark datasets, providing top-1 +and top-5 accuracy metrics. Our study reveals that leveraging GPT-4's advanced +linguistic knowledge to generate rich descriptions markedly improves zero-shot +recognition. In terms of visual proficiency, GPT-4V's average performance +across 16 datasets sits roughly between the capabilities of OpenAI-CLIP's ViT-L +and EVA-CLIP's ViT-E. We hope that this research will contribute valuable data +points and experience for future studies. We release our code at +https://github.com/whwu95/GPT4Vis. + +
+
+ comment: Technical report. Work in progress +
+
+
+
+
+ + ☆ Adinkra Symbol Recognition using Classical Machine Learning and Deep + Learning + + +
+ Artificial intelligence (AI) has emerged as a transformative influence, +engendering paradigm shifts in global societies, spanning academia and +industry. However, in light of these rapid advances, addressing the +underrepresentation of black communities and African countries in AI is +crucial. Boosting enthusiasm for AI can be effectively accomplished by +showcasing straightforward applications around tasks like identifying and +categorizing traditional symbols, such as Adinkra symbols, or familiar objects +within the community. In this research endeavor, we dived into classical +machine learning and harnessed the power of deep learning models to tackle the +intricate task of classifying and recognizing Adinkra symbols. The idea led to +a newly constructed ADINKRA dataset comprising 174,338 images meticulously +organized into 62 distinct classes, each representing a singular and emblematic +symbol. We constructed a CNN model for classification and recognition using six +convolutional layers, three fully connected (FC) layers, and optional dropout +regularization. The model is a simpler and smaller version of VGG, with fewer +layers, smaller channel sizes, and a fixed kernel size. Additionally, we tap +into the transfer learning capabilities provided by pre-trained models like VGG +and ResNet. These models assist us in both classifying images and extracting +features that can be used with classical machine learning models. We assess the +model's performance by measuring its accuracy and convergence rate and +visualizing the areas that significantly influence its predictions. These +evaluations serve as a foundational benchmark for future assessments of the +ADINKRA dataset. We hope this application exemplar inspires ideas on the +various uses of AI in organizing our traditional and modern lives. + +
+
+ comment: 15 pages, 6 figures, 5 tables +
+
+
+
+
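The classifier described in the Adinkra abstract, a smaller VGG-style network with six convolutional layers, three fully connected layers, and optional dropout, can be sketched as follows. Channel widths, the 64x64 input resolution, and the dropout rate are assumptions; the 62 classes match the dataset description.

```python
# Minimal sketch of a small VGG-style CNN: six conv layers, three FC layers, dropout.
import torch
import torch.nn as nn

class SmallVGG(nn.Module):
    def __init__(self, num_classes=62, dropout=0.5):
        super().__init__()
        def block(c_in, c_out):                      # two 3x3 convs + downsampling
            return nn.Sequential(nn.Conv2d(c_in, c_out, 3, padding=1), nn.ReLU(inplace=True),
                                 nn.Conv2d(c_out, c_out, 3, padding=1), nn.ReLU(inplace=True),
                                 nn.MaxPool2d(2))
        self.features = nn.Sequential(block(3, 32), block(32, 64), block(64, 128))
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 8 * 8, 256), nn.ReLU(inplace=True), nn.Dropout(dropout),
            nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(dropout),
            nn.Linear(128, num_classes))

    def forward(self, x):                            # expects 3x64x64 images
        return self.classifier(self.features(x))

model = SmallVGG()
logits = model(torch.randn(2, 3, 64, 64))            # -> shape (2, 62)
```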
+ + ☆ MARIS: Referring Image Segmentation via Mutual-Aware Attention Features + + +
+ Referring image segmentation (RIS) aims to segment a particular region based +on a language expression prompt. Existing methods incorporate linguistic +features into visual features and obtain multi-modal features for mask +decoding. However, these methods may segment the visually salient entity +instead of the correct referring region, as the multi-modal features are +dominated by the abundant visual context. In this paper, we propose MARIS, a +referring image segmentation method that leverages the Segment Anything Model +(SAM) and introduces a mutual-aware attention mechanism to enhance the +cross-modal fusion via two parallel branches. Specifically, our mutual-aware +attention mechanism consists of Vision-Guided Attention and Language-Guided +Attention, which bidirectionally model the relationship between visual and +linguistic features. Correspondingly, we design a Mask Decoder to enable +explicit linguistic guidance for more consistent segmentation with the language +expression. To this end, a multi-modal query token is proposed to integrate +linguistic information and interact with visual information simultaneously. +Extensive experiments on three benchmark datasets show that our method +outperforms the state-of-the-art RIS methods. Our code will be publicly +available. + +
+
+
+
+
+ + ☆ GLIME: General, Stable and Local LIME Explanation NeurIPS 2023 + + +
+ As black-box machine learning models grow in complexity and find applications +in high-stakes scenarios, it is imperative to provide explanations for their +predictions. Although Local Interpretable Model-agnostic Explanations (LIME) +[22] is a widely adopted method for understanding model behaviors, it is +unstable with respect to random seeds [35,24,3] and exhibits low local fidelity +(i.e., how well the explanation approximates the model's local behaviors) +[21,16]. Our study shows that this instability problem stems from small sample +weights, leading to the dominance of regularization and slow convergence. +Additionally, LIME's sampling neighborhood is non-local and biased towards the +reference, resulting in poor local fidelity and sensitivity to reference +choice. To tackle these challenges, we introduce GLIME, an enhanced framework +extending LIME and unifying several prior methods. Within the GLIME framework, +we derive an equivalent formulation of LIME that achieves significantly faster +convergence and improved stability. By employing a local and unbiased sampling +distribution, GLIME generates explanations with higher local fidelity compared +to LIME. GLIME explanations are independent of reference choice. Moreover, +GLIME offers users the flexibility to choose a sampling distribution based on +their specific scenarios. +
+
+ comment: Accepted by NeurIPS 2023 as a Spotlight paper +
+
+
+
+
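The "local and unbiased sampling" idea that GLIME argues for can be illustrated with a generic local surrogate: perturb symmetrically around the instance itself (rather than toward a fixed reference) and fit a simple linear model to the black-box outputs. This is a toy illustration of the principle, not GLIME's actual formulation; the Gaussian width, sample count, and ridge penalty are assumptions.

```python
# Minimal sketch of a local, reference-free linear explanation.
import numpy as np
from sklearn.linear_model import Ridge

def local_linear_explanation(predict_fn, x, sigma=0.5, n_samples=2000, alpha=1e-3, seed=0):
    rng = np.random.default_rng(seed)
    # Sample symmetrically around x itself -> the neighborhood is local and unbiased.
    perturbations = rng.normal(0.0, sigma, size=(n_samples, x.shape[0]))
    samples = x[None, :] + perturbations
    y = predict_fn(samples)                      # black-box predictions on the neighborhood
    surrogate = Ridge(alpha=alpha).fit(perturbations, y)
    return surrogate.coef_                       # per-feature local attribution

# Toy usage with an arbitrary black-box model.
f = lambda X: X[:, 0] ** 2 + 3.0 * X[:, 1]
x0 = np.array([1.0, -2.0, 0.5])
print(local_linear_explanation(f, x0))           # roughly [2., 3., 0.] near x0
```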
+ + ☆ Variational Autoencoders for Feature Exploration and Malignancy + Prediction of Lung Lesions BMVC 2023 + + +
+ Lung cancer is responsible for 21% of cancer deaths in the UK and five-year +survival rates are heavily influenced by the stage the cancer was identified +at. Recent studies have demonstrated the capability of AI methods for accurate +and early diagnosis of lung cancer from routine scans. However, this evidence +has not translated into clinical practice with one barrier being a lack of +interpretable models. This study investigates the application Variational +Autoencoders (VAEs), a type of generative AI model, to lung cancer lesions. +Proposed models were trained on lesions extracted from 3D CT scans in the +LIDC-IDRI public dataset. Latent vector representations of 2D slices produced +by the VAEs were explored through clustering to justify their quality and used +in an MLP classifier model for lung cancer diagnosis, the best model achieved +state-of-the-art metrics of AUC 0.98 and 93.1% accuracy. Cluster analysis shows +the VAE latent space separates the dataset of malignant and benign lesions +based on meaningful feature components including tumour size, shape, patient +and malignancy class. We also include a comparative analysis of the standard +Gaussian VAE (GVAE) and the more recent Dirichlet VAE (DirVAE), which replaces +the prior with a Dirichlet distribution to encourage a more explainable latent +space with disentangled feature representation. Finally, we demonstrate the +potential for latent space traversals corresponding to clinically meaningful +feature changes. + +
+
+ comment: 10 pages (main paper), 5 pages (references), 5 figures, 2 tables, + work accepted for BMVC 2023 +
+
+
+
+
+ + ☆ SAM-6D: Segment Anything Model Meets Zero-Shot 6D Object Pose Estimation + + +
+ Zero-shot 6D object pose estimation involves the detection of novel objects +with their 6D poses in cluttered scenes, presenting significant challenges for +model generalizability. Fortunately, the recent Segment Anything Model (SAM) +has showcased remarkable zero-shot transfer performance, which provides a +promising solution to tackle this task. Motivated by this, we introduce SAM-6D, +a novel framework designed to realize the task through two steps, including +instance segmentation and pose estimation. Given the target objects, SAM-6D +employs two dedicated sub-networks, namely Instance Segmentation Model (ISM) +and Pose Estimation Model (PEM), to perform these steps on cluttered RGB-D +images. ISM takes SAM as an advanced starting point to generate all possible +object proposals and selectively preserves valid ones through meticulously +crafted object matching scores in terms of semantics, appearance and geometry. +By treating pose estimation as a partial-to-partial point matching problem, PEM +performs a two-stage point matching process featuring a novel design of +background tokens to construct dense 3D-3D correspondence, ultimately yielding +the pose estimates. Without bells and whistles, SAM-6D outperforms the existing +methods on the seven core datasets of the BOP Benchmark for both instance +segmentation and pose estimation of novel objects. + +
+
+ comment: Github Page: https://github.com/JiehongLin/SAM-6D +
+
+
+
+
+ + ☆ Model-agnostic Body Part Relevance Assessment for Pedestrian Detection + + +
+ Model-agnostic explanation methods for deep learning models are flexible +regarding usability and availability. However, due to the fact that they can +only manipulate input to see changes in output, they suffer from weak +performance when used with complex model architectures. For models with large +inputs as, for instance, in object detection, sampling-based methods like +KernelSHAP are inefficient due to many computation-heavy forward passes through +the model. In this work, we present a framework for using sampling-based +explanation models in a computer vision context by body part relevance +assessment for pedestrian detection. Furthermore, we introduce a novel +sampling-based method similar to KernelSHAP that shows more robustness for +lower sampling sizes and, thus, is more efficient for explainability analyses +on large-scale datasets. + +
+
+
+
+
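To make the idea of sampling-based relevance over predefined regions (such as body parts) concrete, here is a generic occlude-and-regress baseline in the spirit of the paper above; it is not the paper's proposed estimator. Masking by mean-filling, the random on/off patterns, and the least-squares attribution are all assumptions.

```python
# Minimal sketch of sampling-based relevance for predefined body-part regions:
# randomly remove parts, re-score the detector, and regress the score onto the pattern.
import numpy as np

def part_relevance(score_fn, image, part_masks, n_samples=256, seed=0):
    """score_fn(image) -> detection confidence; part_masks: list of boolean HxW arrays."""
    rng = np.random.default_rng(seed)
    n_parts = len(part_masks)
    fill = image.mean(axis=(0, 1), keepdims=True)         # neutral fill value per channel
    Z = rng.integers(0, 2, size=(n_samples, n_parts))      # random keep/remove patterns
    scores = np.empty(n_samples)
    for i, z in enumerate(Z):
        perturbed = image.copy()
        for keep, mask in zip(z, part_masks):
            if not keep:
                perturbed[mask] = fill                     # occlude this body part
        scores[i] = score_fn(perturbed)
    # Least-squares attribution of the score to each part's presence.
    coef, *_ = np.linalg.lstsq(np.c_[Z, np.ones(n_samples)], scores, rcond=None)
    return coef[:n_parts]
```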
+ + ☆ HAVE-FUN: Human Avatar Reconstruction from Few-Shot Unconstrained Images + + +
+ As for human avatar reconstruction, contemporary techniques commonly +necessitate the acquisition of costly data and struggle to achieve satisfactory +results from a small number of casual images. In this paper, we investigate +this task from a few-shot unconstrained photo album. The reconstruction of +human avatars from such data sources is challenging because of limited data +amount and dynamic articulated poses. For handling dynamic data, we integrate a +skinning mechanism with deep marching tetrahedra (DMTet) to form a drivable +tetrahedral representation, which drives arbitrary mesh topologies generated by +the DMTet for the adaptation of unconstrained images. To effectively mine +instructive information from few-shot data, we devise a two-phase optimization +method with few-shot reference and few-shot guidance. The former focuses on +aligning avatar identity with reference images, while the latter aims to +generate plausible appearances for unseen regions. Overall, our framework, +called HaveFun, can undertake avatar reconstruction, rendering, and animation. +Extensive experiments on our developed benchmarks demonstrate that HaveFun +exhibits substantially superior performance in reconstructing the human body +and hand. Project website: https://seanchenxy.github.io/HaveFunWeb/. + +
+
+
+
+
+ + ☆ Deformation-Guided Unsupervised Non-Rigid Shape Matching + + +
+ We present an unsupervised data-driven approach for non-rigid shape matching. +Shape matching identifies correspondences between two shapes and is a +fundamental step in many computer vision and graphics applications. Our +approach is designed to be particularly robust when matching shapes digitized +using 3D scanners that contain fine geometric detail and suffer from different +types of noise including topological noise caused by the coalescence of +spatially close surface regions. We build on two strategies. First, using a +hierarchical patch based shape representation we match shapes consistently in a +coarse to fine manner, allowing for robustness to noise. This multi-scale +representation drastically reduces the dimensionality of the problem when +matching at the coarsest scale, rendering unsupervised learning feasible. +Second, we constrain this hierarchical matching to be reflected in 3D by +fitting a patch-wise near-rigid deformation model. Using this constraint, we +leverage spatial continuity at different scales to capture global shape +properties, resulting in matchings that generalize well to data with different +deformations and noise characteristics. Experiments demonstrate that our +approach obtains significantly better results on raw 3D scans than +state-of-the-art methods, while performing on-par on standard test scenarios. + +
+
+
+
+
+ + ☆ Technical Report for Argoverse Challenges on 4D Occupancy Forecasting + + +
+ This report presents our Le3DE2E_Occ solution for 4D Occupancy Forecasting in +the Argoverse Challenges at the CVPR 2023 Workshop on Autonomous Driving (WAD). Our +solution consists of a strong LiDAR-based Bird's Eye View (BEV) encoder with +temporal fusion and a two-stage decoder, which combines a DETR head and a UNet +decoder. The solution was tested on the Argoverse 2 sensor dataset to evaluate +the occupancy state 3 seconds in the future. Our solution achieved an 18% lower L1 +error (3.57) than the baseline and took 1st place on the 4D Occupancy +Forecasting task in the Argoverse Challenges at CVPR 2023. +
+
+
+
+
+ + ☆ Regularization by Texts for Latent Diffusion Inverse Solvers + + +
+ The recent advent of diffusion models has led to significant progress in +solving inverse problems, leveraging these models as effective generative +priors. Nonetheless, challenges related to the ill-posed nature of such +problems remain, often due to inherent ambiguities in measurements. Drawing +inspiration from the human ability to resolve visual ambiguities through +perceptual biases, here we introduce a novel latent diffusion inverse solver by +incorporating regularization by texts (TReg). Specifically, TReg applies a +textual description of the preconceived solution during the reverse +sampling phase, and this description is dynamically reinforced through +null-text optimization for adaptive negation. Our comprehensive experimental +results demonstrate that TReg successfully mitigates ambiguity in latent +diffusion inverse solvers, enhancing their effectiveness and accuracy. +
+
+
+
+
+ + ☆ Enhancing Diffusion Models with Text-Encoder Reinforcement Learning + + +
+ Text-to-image diffusion models are typically trained to optimize the +log-likelihood objective, which presents challenges in meeting specific +requirements for downstream tasks, such as image aesthetics and image-text +alignment. Recent research addresses this issue by refining the diffusion U-Net +using human rewards through reinforcement learning or direct backpropagation. +However, many of them overlook the importance of the text encoder, which is +typically pretrained and fixed during training. In this paper, we demonstrate +that by finetuning the text encoder through reinforcement learning, we can +enhance the text-image alignment of the results, thereby improving the visual +quality. Our primary motivation comes from the observation that the current +text encoder is suboptimal, often requiring careful prompt adjustment. While +fine-tuning the U-Net can partially improve performance, it still suffers +from the suboptimal text encoder. Therefore, we propose to use reinforcement +learning with low-rank adaptation to finetune the text encoder based on +task-specific rewards, referred to as TexForce. We first show that +finetuning the text encoder can improve the performance of diffusion models. +Then, we illustrate that TexForce can be simply combined with existing U-Net +finetuned models to get much better results without additional training. +Finally, we showcase the adaptability of our method in diverse applications, +including the generation of high-quality face and hand images. +
+
+
+
+
+ + ☆ Reinforcement Learning from Diffusion Feedback: Q* for Image Search + + +
+ Large vision-language models are steadily gaining personalization +capabilities at the cost of fine-tuning or data augmentation. We present two +models for image generation using model-agnostic learning that align semantic +priors with generative capabilities. RLDF, or Reinforcement Learning from +Diffusion Feedback, is a singular approach for visual imitation through +prior-preserving reward function guidance. This employs Q-learning (with +standard Q*) for generation and follows a semantic-rewarded trajectory for +image search through finite encoding-tailored actions. The second proposed +method, noisy diffusion gradient, is optimization driven. At the root of both +methods is a special CFG encoding that we propose for continual semantic +guidance. Using only a single input image and no text input, RLDF generates +high-quality images over varied domains including retail, sports and +agriculture showcasing class-consistency and strong visual diversity. Project +website is available at https://infernolia.github.io/RLDF. + +
+
+
+
+
+ + ☆ PaintNeSF: Artistic Creation of Stylized Scenes with Vectorized 3D + Strokes + + +
+ We present Paint Neural Stroke Field (PaintNeSF), a novel technique to +generate stylized images of a 3D scene at arbitrary novel views from multi-view +2D images. Different from existing methods which apply stylization to trained +neural radiance fields at the voxel level, our approach draws inspiration from +image-to-painting methods, simulating the progressive painting process of human +artwork with vector strokes. We develop a palette of stylized 3D strokes from +basic primitives and splines, and consider the 3D scene stylization task as a +multi-view reconstruction process based on these 3D stroke primitives. Instead +of directly searching for the parameters of these 3D strokes, which would be +too costly, we introduce a differentiable renderer that allows optimizing +stroke parameters using gradient descent, and propose a training scheme to +alleviate the vanishing gradient issue. The extensive evaluation demonstrates +that our approach effectively synthesizes 3D scenes with significant geometric +and aesthetic stylization while maintaining a consistent appearance across +different views. Our method can be further integrated with style loss and +image-text contrastive models to extend its applications, including color +transfer and text-driven 3D scene drawing. + +
+
+
+
+
+ + ☆ Only Positive Cases: 5-fold High-order Attention Interaction Model for + Skin Segmentation Derived Classification + + +
+ Computer-aided diagnosis of skin diseases is an important tool. However, the +interpretability of computer-aided diagnosis is currently poor. Dermatologists +and patients cannot intuitively understand the learning and prediction process +of neural networks, which will lead to a decrease in the credibility of +computer-aided diagnosis. In addition, traditional methods need to be trained +using negative samples in order to predict the presence or absence of a lesion, +but medical data is often in short supply. In this paper, we propose a multiple +high-order attention interaction model (MHA-UNet) for use in a highly +explainable skin lesion segmentation task. MHA-UNet is able to obtain the +presence or absence of a lesion by explainable reasoning without the need for +training on negative samples. Specifically, we propose a high-order attention +interaction mechanism that introduces squeeze attention to a higher level for +feature attention. In addition, a multiple high-order attention interaction +(MHAblock) module is proposed by combining the different features of different +orders. For classifying the presence or absence of lesions, we conducted +classification experiments on several publicly available datasets in the +absence of negative samples, based on explainable reasoning about the +interaction of 5 attention orders of MHAblock. The highest positive detection +rate obtained from the experiments was 81.0% and the highest negative detection +rate was 83.5%. For segmentation experiments, comparison experiments of the +proposed method with 13 medical segmentation models and external validation +experiments with 8 state-of-the-art models in three public datasets and our +clinical dataset demonstrate the state-of-the-art performance of our model. The +code is available from https://github.com/wurenkai/MHA-UNet. + +
+
+
+
+
+ + ☆ Align before Adapt: Leveraging Entity-to-Region Alignments for + Generalizable Video Action Recognition + + +
+ Large-scale visual-language pre-trained models have achieved significant +success in various video tasks. However, most existing methods follow an "adapt +then align" paradigm, which adapts pre-trained image encoders to model +video-level representations and utilizes one-hot or text embeddings of the +action labels for supervision. This paradigm overlooks the challenge of mapping +from static images to complicated activity concepts. In this paper, we propose +a novel "Align before Adapt" (ALT) paradigm. Prior to adapting to video +representation learning, we exploit the entity-to-region alignments for each +frame. The alignments are fulfilled by matching the region-aware image +embeddings to an offline-constructed text corpus. With the aligned entities, we +feed their text embeddings to a transformer-based video adapter as the queries, +which can help extract the semantics of the most important entities from a +video to a vector. This paradigm reuses the visual-language alignment of VLP +during adaptation and tries to explain an action by the underlying entities. +This helps understand actions by bridging the gap with complex activity +semantics, particularly when facing unfamiliar or unseen categories. ALT +achieves competitive performance and superior generalizability while requiring +significantly lower computational costs. In fully supervised scenarios, it +achieves 88.1% top-1 accuracy on Kinetics-400 with only 4947 GFLOPs. In 2-shot +experiments, ALT outperforms the previous state-of-the-art by 7.1% and 9.2% on +HMDB-51 and UCF-101, respectively. +
+
+
+
+
+ + ☆ Technical Report for Argoverse Challenges on Unified Sensor-based + Detection, Tracking, and Forecasting + + +
+ This report presents our Le3DE2E solution for unified sensor-based detection, +tracking, and forecasting in Argoverse Challenges at CVPR 2023 Workshop on +Autonomous Driving (WAD). We propose a unified network that incorporates three +tasks, including detection, tracking, and forecasting. This solution adopts a +strong Bird's Eye View (BEV) encoder with spatial and temporal fusion and +generates unified representations for multi-tasks. The solution was tested in +the Argoverse 2 sensor dataset to evaluate the detection, tracking, and +forecasting of 26 object categories. We achieved 1st place in Detection, +Tracking, and Forecasting on the E2E Forecasting track in Argoverse Challenges +at CVPR 2023 WAD. + +
+
+
+
+
+ + ☆ A manometric feature descriptor with linear-SVM to distinguish + esophageal contraction vigor + + +
+ In clinical practice, if a patient presents with nonmechanical obstructive dysphagia, +esophageal chest pain, and gastroesophageal reflux symptoms, the physician +will usually assess esophageal dynamic function. High-resolution manometry +(HRM) is a commonly used clinical technique for comprehensive and objective detection of esophageal +dynamic function. However, after the HRM results are obtained, doctors still need to evaluate a variety of parameters, +which is burdensome and complex. We applied image +processing to HRM to predict esophageal contraction vigor and assist the +evaluation of esophageal dynamic function. Firstly, we used Feature-Extraction +and Histogram of Gradients (FE-HOG) to analyze features of the proposal of swallow +(PoS) and further extract higher-order features. Then we classify +esophageal contraction vigor as normal, weak, or failed using a +linear SVM on these features. Our dataset includes 3000 training +samples, 500 validation samples and 411 test samples. After verification, our accuracy +reaches 86.83%, which is higher than that of other common machine learning methods. +
+
+
+
+
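The HOG-plus-linear-SVM pipeline described above maps directly onto standard scikit-image and scikit-learn calls. A minimal sketch follows; the crop size, HOG parameters, and the toy three-class labels (normal / weak / failed) are assumptions, not the paper's exact configuration.

```python
# Minimal sketch of a HOG + linear SVM classifier for proposal-of-swallow crops.
import numpy as np
from skimage.feature import hog
from sklearn.svm import LinearSVC

def hog_features(swallow_images):
    """swallow_images: array of shape (N, H, W), one cropped proposal-of-swallow each."""
    return np.stack([hog(img, orientations=9, pixels_per_cell=(8, 8),
                         cells_per_block=(2, 2), block_norm="L2-Hys")
                     for img in swallow_images])

# Toy data standing in for HRM proposal-of-swallow crops.
X_imgs = np.random.rand(30, 64, 64)
y = np.random.randint(0, 3, size=30)             # 0: normal, 1: weak, 2: failed

clf = LinearSVC(C=1.0, max_iter=10000)
clf.fit(hog_features(X_imgs), y)
pred = clf.predict(hog_features(X_imgs[:5]))
```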
+ + ☆ Spatially Covariant Image Registration with Text Prompts + + +
+ Medical images are often characterized by their structured anatomical +representations and spatially inhomogeneous contrasts. Leveraging anatomical +priors in neural networks can greatly enhance their utility in +resource-constrained clinical settings. Prior research has harnessed such +information for image segmentation, yet progress in deformable image +registration has been modest. Our work introduces textSCF, a novel method that +integrates spatially covariant filters and textual anatomical prompts encoded +by visual-language models, to fill this gap. This approach optimizes an +implicit function that correlates text embeddings of anatomical regions to +filter weights, relaxing the typical translation-invariance constraint of +convolutional operations. TextSCF not only boosts computational efficiency but +can also retain or improve registration accuracy. By capturing the contextual +interplay between anatomical regions, it offers impressive inter-regional +transferability and the ability to preserve structural discontinuities during +registration. TextSCF's performance has been rigorously tested on inter-subject +brain MRI and abdominal CT registration tasks, outperforming existing +state-of-the-art models in the MICCAI Learn2Reg 2021 challenge and leading the +leaderboard. In abdominal registrations, textSCF's larger model variant +improved the Dice score by 11.3% over the second-best model, while its smaller +variant maintained similar accuracy but with an 89.13% reduction in network +parameters and a 98.34% decrease in computational operations. +
+
+ comment: 15 pages, 8 figures, 5 tables +
+
+
+
+
+ + ☆ 2D Feature Distillation for Weakly- and Semi-Supervised 3D Semantic + Segmentation WACV 2024 + + +
+ As 3D perception problems grow in popularity and the need for large-scale +labeled datasets for LiDAR semantic segmentation increases, new methods arise +that aim to reduce the necessity for dense annotations by employing +weakly-supervised training. However, these methods continue to show weak +boundary estimation and high false negative rates for small objects and distant +sparse regions. We argue that such weaknesses can be compensated for by using RGB +images, which provide a denser representation of the scene. We propose an +image-guidance network (IGNet) which builds upon the idea of distilling high-level +feature information from a domain-adapted, synthetically trained 2D +semantic segmentation network. We further utilize a one-way contrastive +learning scheme alongside a novel mixing strategy called FOVMix, to combat the +horizontal field-of-view mismatch between the two sensors and enhance the +effects of image guidance. IGNet achieves state-of-the-art results for +weakly-supervised LiDAR semantic segmentation on ScribbleKITTI, boasting up to +98% relative performance to fully supervised training with only 8% labeled +points, while introducing no additional annotation burden or +computational/memory cost during inference. Furthermore, we show that our +contributions also prove effective for semi-supervised training, where IGNet +claims state-of-the-art results on both ScribbleKITTI and SemanticKITTI. +
+
+ comment: Accepted at WACV 2024 +
+
+
+
+
+ + ☆ UniRepLKNet: A Universal Perception Large-Kernel ConvNet for Audio, + Video, Point Cloud, Time-Series and Image Recognition + + +
+ Large-kernel convolutional neural networks (ConvNets) have recently received +extensive research attention, but there are two unresolved and critical issues +that demand further investigation. 1) The architectures of existing +large-kernel ConvNets largely follow the design principles of conventional +ConvNets or transformers, while the architectural design for large-kernel +ConvNets remains under-addressed. 2) As transformers have dominated multiple +modalities, it remains to be investigated whether ConvNets also have a strong +universal perception ability in domains beyond vision. In this paper, we +contribute from two aspects. 1) We propose four architectural guidelines for +designing large-kernel ConvNets, the core of which is to exploit the essential +characteristics of large kernels that distinguish them from small kernels - +they can see wide without going deep. Following such guidelines, our proposed +large-kernel ConvNet shows leading performance in image recognition. For +example, our models achieve an ImageNet accuracy of 88.0%, ADE20K mIoU of +55.6%, and COCO box AP of 56.4%, demonstrating better performance and higher +speed than a number of recently proposed powerful competitors. 2) We discover +that large kernels are the key to unlocking the exceptional performance of +ConvNets in domains where they were originally not proficient. With certain +modality-related preprocessing approaches, the proposed model achieves +state-of-the-art performance on time-series forecasting and audio recognition +tasks even without modality-specific customization to the architecture. Code +and all the models at https://github.com/AILab-CVC/UniRepLKNet. + +
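The "see wide without going deep" principle can be illustrated with a generic large-kernel block: a depthwise convolution with a very large kernel for spatial context, followed by pointwise channel mixing. This is a textbook-style sketch under assumed kernel size, normalization, and channel widths, not the UniRepLKNet architecture itself.

```python
# Generic large-kernel block sketch (not the UniRepLKNet design): a cheap
# depthwise conv with a wide receptive field plus pointwise channel mixing.
import torch
import torch.nn as nn

class LargeKernelBlock(nn.Module):
    def __init__(self, channels=64, kernel_size=31):
        super().__init__()
        self.dw = nn.Conv2d(channels, channels, kernel_size,
                            padding=kernel_size // 2, groups=channels)
        self.norm = nn.BatchNorm2d(channels)
        self.pw = nn.Sequential(
            nn.Conv2d(channels, 4 * channels, 1), nn.GELU(),
            nn.Conv2d(4 * channels, channels, 1),
        )

    def forward(self, x):
        x = x + self.norm(self.dw(x))   # large spatial context from depthwise conv
        x = x + self.pw(x)              # pointwise channel mixing
        return x

# Example: LargeKernelBlock()(torch.randn(1, 64, 56, 56)) -> (1, 64, 56, 56)
```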
+
+ comment: Code, all the models and reproducible training scripts at + https://github.com/AILab-CVC/UniRepLKNet +
+
+
+
+
+ + ☆ Can Vision-Language Models Think from a First-Person Perspective? + + +
+ Vision-language models (VLMs) have recently shown promising results in +traditional downstream tasks. Evaluation studies have emerged to assess their +abilities, with the majority focusing on the third-person perspective, and only +a few addressing specific tasks from the first-person perspective. However, the +capability of VLMs to "think" from a first-person perspective, a crucial +attribute for advancing autonomous agents and robotics, remains largely +unexplored. To bridge this research gap, we introduce EgoThink, a novel visual +question-answering benchmark that encompasses six core capabilities with twelve +detailed dimensions. The benchmark is constructed using selected clips from +egocentric videos, with manually annotated question-answer pairs containing +first-person information. To comprehensively assess VLMs, we evaluate eighteen +popular VLMs on EgoThink. Moreover, given the open-ended format of the answers, +we use GPT-4 as the automatic judge to compute single-answer grading. +Experimental results indicate that although GPT-4V leads in numerous +dimensions, all evaluated VLMs still possess considerable potential for +improvement in first-person perspective tasks. Meanwhile, enlarging the number +of trainable parameters has the most significant impact on model performance on +EgoThink. In conclusion, EgoThink serves as a valuable addition to existing +evaluation benchmarks for VLMs, providing an indispensable resource for future +research in the realm of embodied artificial intelligence and robotics. + +
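Open-ended answers are graded with GPT-4 as an automatic judge; a minimal sketch of such single-answer grading is shown below. The prompt wording, model name, and scoring scale are assumptions for illustration and are not taken from the EgoThink paper.

```python
# Minimal LLM-as-judge sketch (assumed prompt and 0-5 scale, not EgoThink's):
# ask a judge model to grade a candidate answer against a reference answer.
from openai import OpenAI

client = OpenAI()  # expects OPENAI_API_KEY in the environment

def judge_answer(question, reference, candidate, model="gpt-4"):
    prompt = (
        "You are grading an answer to a first-person visual question.\n"
        f"Question: {question}\nReference answer: {reference}\n"
        f"Candidate answer: {candidate}\n"
        "Give a single integer score from 0 (wrong) to 5 (fully correct)."
    )
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    return resp.choices[0].message.content.strip()
```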
+
+
+
+
+ + ☆ An Ensemble of 2.5D ResUnet Based Models for Segmentation for Kidney and + Masses + + +
+ The automatic segmentation of kidney, kidney tumor and kidney cyst on Computed Tomography (CT) scans is a challenging task due to the indistinct lesion boundaries and fuzzy texture. Considering the large range and unbalanced distribution of CT scan thicknesses, a 2.5D ResUnet is adopted to build an efficient coarse-to-fine semantic segmentation framework in this work. A set of 489 CT scans is used for training and validation, and an independent set of never-before-used CT scans is used for testing. Finally, we demonstrate the effectiveness of our proposed method. The Dice values on the test set are 0.954, 0.792, and 0.691, and the surface Dice values are 0.897, 0.591, and 0.541 for kidney, tumor and cyst, respectively. The average inference time for each CT scan is 20.65 s and the maximum GPU memory is 3525 MB. The results suggest a better trade-off between model performance and efficiency. + +
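A 2.5D network takes a stack of neighboring slices as input channels, preserving in-plane resolution while keeping some through-plane context. The NumPy sketch below shows one common way to build such inputs; the slice count and edge-padding policy are assumptions, not the authors' exact settings.

```python
# Illustrative 2.5D input construction (assumed settings): each sample is the
# target slice plus k neighbors on each side, stacked as channels.
import numpy as np

def make_25d_stack(volume, index, k=2):
    """volume: (D, H, W) CT volume; returns a (2k+1, H, W) slice stack."""
    depth = volume.shape[0]
    idxs = np.clip(np.arange(index - k, index + k + 1), 0, depth - 1)
    return volume[idxs]

# Example: centre slice 40 of a (120, 512, 512) scan -> (5, 512, 512) input
# stack = make_25d_stack(ct_volume, index=40, k=2)
```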
+
+ comment: 7 pages, 2 figures +
+
+
+
+
+ + ☆ A deep learning approach for marine snow synthesis and removal + + +
+ Marine snow, the floating particles in underwater images, severely degrades +the visibility and performance of human and machine vision systems. This paper +proposes a novel method to reduce the marine snow interference using deep +learning techniques. We first synthesize realistic marine snow samples by +training a Generative Adversarial Network (GAN) model and combine them with +natural underwater images to create a paired dataset. We then train a U-Net +model to perform marine snow removal as an image to image translation task. Our +experiments show that the U-Net model can effectively remove both synthetic and +natural marine snow with high accuracy, outperforming state-of-the-art methods +such as the Median filter and its adaptive variant. We also demonstrate the +robustness of our method by testing it on the MSRB dataset, which contains +synthetic artifacts that our model has not seen during training. Our method is +a practical and efficient solution for enhancing underwater images affected by +marine snow. + +
+
+
+
+
+ + ☆ Real Time GAZED: Online Shot Selection and Editing of Virtual Cameras + from Wide-Angle Monocular Video Recordings + + +
+ Eliminating time-consuming post-production processes and delivering +high-quality videos in today's fast-paced digital landscape are the key +advantages of real-time approaches. To address these needs, we present Real +Time GAZED: a real-time adaptation of the GAZED framework integrated with +CineFilter, a novel real-time camera trajectory stabilization approach. It +enables users to create professionally edited videos in real-time. Comparative +evaluations against baseline methods, including the non-real-time GAZED, +demonstrate that Real Time GAZED achieves similar editing results, ensuring +high-quality video output. Furthermore, a user study confirms the aesthetic +quality of the video edits produced by the Real Time GAZED approach. With these +advancements in real-time camera trajectory optimization and video editing +presented, the demand for immediate and dynamic content creation in industries +such as live broadcasting, sports coverage, news reporting, and social media +content creation can be met more efficiently. + +
+
+
+
+
+ + ☆ EucliDreamer: Fast and High-Quality Texturing for 3D Models with Stable + Diffusion Depth + + +
+ This paper presents a novel method to generate textures for 3D models given text prompts and 3D meshes. Additional depth information is taken into account to perform the Score Distillation Sampling (SDS) process [28] with depth-conditional Stable Diffusion [34]. We ran our model over the open-source dataset Objaverse [7] and conducted a user study to compare the results with those of various 3D texturing methods. We show that our model can generate more satisfactory results and produce various art styles for the same object. In addition, we achieve faster generation times for textures of comparable quality. We also conduct thorough ablation studies of how different factors may affect generation quality, including sampling steps, guidance scale, negative prompts, data augmentation, elevation range, and alternatives to SDS. + +
+
+
+
+
+ + ☆ Video-based Visible-Infrared Person Re-Identification with Auxiliary + Samples + + +
+ Visible-infrared person re-identification (VI-ReID) aims to match persons +captured by visible and infrared cameras, allowing person retrieval and +tracking in 24-hour surveillance systems. Previous methods focus on learning +from cross-modality person images in different cameras. However, temporal +information and single-camera samples tend to be neglected. To crack this nut, +in this paper, we first contribute a large-scale VI-ReID dataset named +BUPTCampus. Different from most existing VI-ReID datasets, it 1) collects +tracklets instead of images to introduce rich temporal information, 2) contains +pixel-aligned cross-modality sample pairs for better modality-invariant +learning, 3) provides one auxiliary set to help enhance the optimization, in +which each identity only appears in a single camera. Based on our constructed +dataset, we present a two-stream framework as baseline and apply Generative +Adversarial Network (GAN) to narrow the gap between the two modalities. To +exploit the advantages introduced by the auxiliary set, we propose a curriculum +learning based strategy to jointly learn from both primary and auxiliary sets. +Moreover, we design a novel temporal k-reciprocal re-ranking method to refine +the ranking list with fine-grained temporal correlation cues. Experimental +results demonstrate the effectiveness of the proposed methods. We also +reproduce 9 state-of-the-art image-based and video-based VI-ReID methods on +BUPTCampus and our methods show substantial superiority to them. The codes and +dataset are available at: https://github.com/dyhBUPT/BUPTCampus. + +
+
+ comment: Accepted by Transactions on Information Forensics & Security 2023 +
+
+
+
+
+ + ☆ UFDA: Universal Federated Domain Adaptation with Practical Assumptions AAAI2024 + + +
+ Conventional Federated Domain Adaptation (FDA) approaches usually demand an +abundance of assumptions, such as label set consistency, which makes them +significantly less feasible for real-world situations and introduces security +hazards. In this work, we propose a more practical scenario named Universal +Federated Domain Adaptation (UFDA). It only requires the black-box model and +the label set information of each source domain, while the label sets of +different source domains could be inconsistent and the target-domain label set +is totally blind. This relaxes the assumptions made by FDA, which are often +challenging to meet in real-world cases and diminish model security. To address +the UFDA scenario, we propose a corresponding framework called Hot-Learning +with Contrastive Label Disambiguation (HCLD), which tackles UFDA's domain +shifts and category gaps problem by using one-hot outputs from the black-box +models of various source domains. Moreover, to better distinguish the shared +and unknown classes, we further present a cluster-level strategy named +Mutual-Voting Decision (MVD) to extract robust consensus knowledge across peer +classes from both source and target domains. The extensive experiments on three +benchmarks demonstrate that our HCLD achieves comparable performance for our +UFDA scenario with much fewer assumptions, compared to the previous +methodologies with many additional assumptions. + +
+
+ comment: Submitted to AAAI2024 +
+
+
+
+
+ + ☆ Improving Adaptability and Generalizability of Efficient Transfer + Learning for Vision-Language Models + + +
+ Vision-Language Models (VLMs) like CLIP have demonstrated remarkable +applicability across a variety of downstream tasks, including zero-shot image +classification. Recently, the use of prompts or adapters for efficient transfer +learning has gained significant attention for effectively adapting to +downstream tasks. However, the roles of vision and text prompts, as well as +adapters in terms of generalization and transfer difficulty, have been +overlooked, limiting performance on unseen tasks. In this paper, we empirically +analyze how VLMs behave when using vision and text prompts, adapters, and a +combination of these components, marking a novel exploration by our study. Our +observations find that utilizing vision prompts for class separability and text +adapters for task adaptation is crucial for adaptability and generalizability. +Moreover, to improve generalization across every domain, we propose an adaptive +ensemble method that effectively combines the general knowledge of VLMs with +task-specific knowledge according to transfer difficulty. Upon experimenting +with extensive benchmarks, our method consistently outperforms all baselines, +particularly on unseen tasks, demonstrating the effectiveness of our proposed +approach. + +
+
+ comment: 11 pages (19 pages including supplementary), 10 figures (12 figures + including supplementary), 6 tables (17 tables including supplementary) +
+
+
+
+
+ + ☆ Fully Authentic Visual Question Answering Dataset from Online + Communities + + +
+ Visual Question Answering (VQA) entails answering questions about images. We +introduce the first VQA dataset in which all contents originate from an +authentic use case. Sourced from online question answering community forums, we +call it VQAonline. We then characterize our dataset and how it relates to eight +other VQA datasets. Observing that answers in our dataset tend to be much +longer (e.g., with a mean of 173 words) and thus incompatible with standard VQA +evaluation metrics, we next analyze which of the six popular metrics for longer +text evaluation align best with human judgments. We then use the best-suited +metrics to evaluate six state-of-the-art vision and language foundation models +on VQAonline and reveal where they struggle most. We will release the dataset +soon to facilitate future extensions. + +
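Because answers average around 173 words, standard VQA accuracy does not apply and longer-text metrics are used instead. The sketch below scores a long free-form answer with one such metric (ROUGE-L), chosen here purely as an example rather than as one the paper necessarily recommends.

```python
# Example of scoring long free-form answers with a longer-text metric.
# ROUGE-L is used for illustration only; the paper compares six such metrics
# against human judgments to pick the best-suited ones.
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

def score_answer(reference, prediction):
    """Return the ROUGE-L F-measure between a reference and a model answer."""
    return scorer.score(reference, prediction)["rougeL"].fmeasure
```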
+
+
+
+
+ + ☆ ET3D: Efficient Text-to-3D Generation via Multi-View Distillation + + +
+ Recent breakthroughs in text-to-image generation have shown encouraging results via large generative models. Due to the scarcity of 3D assets, it is hard to transfer the success of text-to-image generation to text-to-3D generation. Existing text-to-3D generation methods usually adopt the paradigm of DreamFusion, which conducts per-asset optimization by distilling a pretrained text-to-image diffusion model. The generation speed usually ranges from several minutes to tens of minutes per 3D asset, which degrades the user experience and also imposes a burden on service providers due to the high computational budget. + In this work, we present an efficient text-to-3D generation method, which requires only around 8 ms to generate a 3D asset given the text prompt on a consumer graphics card. The main insight is that we exploit the images generated by a large pre-trained text-to-image diffusion model to supervise the training of a text-conditioned 3D generative adversarial network. Once the network is trained, we are able to efficiently generate a 3D asset via a single forward pass. Our method requires no 3D training data and provides an alternative approach for efficient text-to-3D generation by distilling pre-trained image diffusion models. + +
+
+
+
+
+ + ☆ PKU-I2IQA: An Image-to-Image Quality Assessment Database for AI + Generated Images + + +
+ With the development of image generation technology, AI-based image generation has been applied in various fields. However, the development of AIGC image generative models also brings new problems and challenges. A significant challenge is that AI-generated images (AIGI), compared to natural images, may have some unique distortions, and not all generated images meet the requirements of the real world, so it is of great significance to evaluate AI-generated images more comprehensively. Although previous work has established some human perception-based AIGC image quality assessment databases for text-generated images, AI image generation technology includes scenarios like text-to-image and image-to-image, and assessing only the images generated by text-to-image models is insufficient. To address this issue, we have established a human perception-based image-to-image AIGC image quality assessment database, named PKU-I2IQA. We conducted a comprehensive analysis of the PKU-I2IQA database. Furthermore, we introduced two benchmark models: NR-AIGCIQA based on no-reference image quality assessment and FR-AIGCIQA based on full-reference image quality assessment. Finally, leveraging this database, we conducted benchmark experiments and compared the performance of the proposed benchmark models. The PKU-I2IQA database and benchmarks will be released at https://github.com/jiquan123/I2IQA to facilitate future research. + Keywords: AIGC, image-to-image generation, image quality assessment, NR-AIGCIQA, FR-AIGCIQA + +
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ Instruct2Attack: Language-Guided Semantic Adversarial Attacks + + +
+ We propose Instruct2Attack (I2A), a language-guided semantic attack that +generates semantically meaningful perturbations according to free-form language +instructions. We make use of state-of-the-art latent diffusion models, where we +adversarially guide the reverse diffusion process to search for an adversarial +latent code conditioned on the input image and text instruction. Compared to +existing noise-based and semantic attacks, I2A generates more natural and +diverse adversarial examples while providing better controllability and +interpretability. We further automate the attack process with GPT-4 to generate +diverse image-specific text instructions. We show that I2A can successfully +break state-of-the-art deep neural networks even under strong adversarial +defenses, and demonstrate great transferability among a variety of network +architectures. + +
+
+ comment: under submission, code coming soon +
+
+
+
+
+ + ☆ Dataset Distillation in Latent Space + + +
+ Dataset distillation (DD) is a newly emerging research area aiming at alleviating the heavy computational load of training models on large datasets. It tries to distill a large dataset into a small and condensed one so that models trained on the distilled dataset can perform comparably with those trained on the full dataset when performing downstream tasks. Among the previous works in this area, there are three key problems that hinder the performance and availability of existing DD methods: high time complexity, high space complexity, and low info-compactness. In this work, we simultaneously attempt to settle these three problems by moving the DD processes from the conventionally used pixel space to latent space. Encoded by a pretrained generic autoencoder, latent codes in the latent space are naturally info-compact representations of the original images at much smaller sizes. After transferring three mainstream DD algorithms to latent space, we significantly reduce time and space consumption while achieving similar performance, allowing us to distill high-resolution datasets or target greater data ratios at which previous methods have failed. Besides, within the same storage budget, we can also deliver quantitatively more latent codes than pixel-level images, which further boosts the performance of our methods. + +
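Moving distillation into latent space starts by encoding the original images with a pretrained generic autoencoder. Below is a hedged sketch using the diffusers KL autoencoder; the specific checkpoint, preprocessing, and scaling are assumptions, since the abstract does not name them.

```python
# Sketch of encoding images into a compact latent space before distillation.
# The autoencoder checkpoint and preprocessing are illustrative assumptions.
import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse").eval()

@torch.no_grad()
def encode_batch(images):
    """images: (B, 3, H, W) in [0, 1] -> latent codes of shape (B, 4, H/8, W/8)."""
    latents = vae.encode(images * 2.0 - 1.0).latent_dist.sample()
    return latents * vae.config.scaling_factor  # distill these instead of pixels

@torch.no_grad()
def decode_batch(latents):
    """Map distilled latent codes back to images in [0, 1] for downstream use."""
    images = vae.decode(latents / vae.config.scaling_factor).sample
    return (images.clamp(-1, 1) + 1.0) / 2.0
```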
+
+ comment: Under review +
+
+
+
+
+ + ☆ Beyond Pixels: Exploring Human-Readable SVG Generation for Simple Images + with Vision Language Models + + +
+ In the field of computer graphics, the use of vector graphics, particularly Scalable Vector Graphics (SVG), represents a notable development from traditional pixel-based imagery. SVGs, with their XML-based format, are distinct in their ability to directly and explicitly represent visual elements such as shape, color, and path. This direct representation facilitates a more accurate and logical depiction of graphical elements, enhancing reasoning and interpretability. Recognizing the potential of SVGs, the machine learning community has introduced multiple methods for image vectorization. However, transforming images into SVG format while retaining the relational properties and context of the original scene remains a key challenge. Most vectorization methods often yield SVGs that are overly complex and not easily interpretable. In response to this challenge, we introduce our method, Simple-SVG-Generation (S²VG²). Our method focuses on producing SVGs that are both accurate and simple, aligning with human readability and understanding. On simple images, we evaluate our method on reasoning tasks together with advanced language models, and the results show a clear improvement over previous SVG generation methods. We also conducted surveys for human evaluation of the readability of our generated SVGs, and the results also favor our method. + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ☆ EAFP-Med: An Efficient Adaptive Feature Processing Module Based on + Prompts for Medical Image Detection + + +
+ In the face of rapid advances in medical imaging, cross-domain adaptive +medical image detection is challenging due to the differences in lesion +representations across various medical imaging technologies. To address this +issue, we draw inspiration from large language models to propose EAFP-Med, an +efficient adaptive feature processing module based on prompts for medical image +detection. EAFP-Med can efficiently extract lesion features of different scales +from a diverse range of medical images based on prompts while being flexible +and not limited by specific imaging techniques. Furthermore, it serves as a +feature preprocessing module that can be connected to any model front-end to +enhance the lesion features in input images. Moreover, we propose a novel +adaptive disease detection model named EAFP-Med ST, which utilizes the Swin +Transformer V2 - Tiny (SwinV2-T) as its backbone and connects it to EAFP-Med. +We have compared our method to nine state-of-the-art methods. Experimental +results demonstrate that EAFP-Med ST achieves the best performance on all three +datasets (chest X-ray images, cranial magnetic resonance imaging images, and +skin images). EAFP-Med can efficiently extract lesion features from various +medical images based on prompts, enhancing the model's performance. This holds +significant potential for improving medical image analysis and diagnosis. + +
+
+
+
+
+ + ☆ SED: A Simple Encoder-Decoder for Open-Vocabulary Semantic Segmentation + + +
+ Open-vocabulary semantic segmentation strives to distinguish pixels into different semantic groups from an open set of categories. Most existing methods explore utilizing pre-trained vision-language models, in which the key is to adopt the image-level model for the pixel-level segmentation task. In this paper, we propose a simple encoder-decoder, named SED, for open-vocabulary semantic segmentation, which comprises a hierarchical encoder-based cost map generation and a gradual fusion decoder with category early rejection. The hierarchical encoder-based cost map generation employs a hierarchical backbone, instead of a plain transformer, to predict the pixel-level image-text cost map. Compared to a plain transformer, a hierarchical backbone better captures local spatial information and has linear computational complexity with respect to input size. Our gradual fusion decoder employs a top-down structure to combine the cost map and the feature maps of different backbone levels for segmentation. To accelerate inference, we introduce a category early rejection scheme in the decoder that rejects many non-existent categories in the early layers of the decoder, resulting in at most 4.7 times acceleration without accuracy degradation. Experiments are performed on multiple open-vocabulary semantic segmentation datasets, which demonstrates the efficacy of our SED method. When using ConvNeXt-B, our SED method achieves an mIoU score of 31.6% on ADE20K with 150 categories at 82 milliseconds (ms) per image on a single A6000. We will release it at https://github.com/xb534/SED.git. + +
+
+
+
+
+ + ☆ SVRDA: A Web-based Dataset Annotation Tool for Slice-to-Volume + Registration + + +
+ Background and Objective: The lack of benchmark datasets has impeded the development of slice-to-volume registration algorithms. Such datasets are difficult to annotate, primarily due to the dimensional difference within data and the dearth of task-specific software. We aim to develop a user-friendly tool to streamline dataset annotation for slice-to-volume registration. + Methods: The proposed tool, named SVRDA, is an installation-free web application for platform-agnostic collaborative dataset annotation. It enables efficient transformation manipulation via keyboard shortcuts and smooth case transitions with auto-saving. SVRDA supports configuration-based data loading and adheres to the separation of concerns, offering great flexibility and extensibility for future research. Various supplementary features have been implemented to facilitate slice-to-volume registration. + Results: We validated the effectiveness of SVRDA by indirectly evaluating the post-registration segmentation quality on UK Biobank data, observing a dramatic overall improvement (24.02% in the Dice Similarity Coefficient and 48.93% in the 95th percentile Hausdorff distance), supported by highly statistically significant evidence (p < 0.001). We further showcased the clinical usage of SVRDA by integrating it into test-retest T1 quantification on in-house magnetic resonance images, leading to more consistent results after registration. + Conclusions: SVRDA can facilitate collaborative annotation of benchmark datasets while being potentially applicable to other pipelines incorporating slice-to-volume registration. Full source code and documentation are available at https://github.com/Roldbach/SVRDA + +
+
+ comment: 18 pages, 11 figures, In submission to Computer Methods and Programs + in Biomedicine +
+
+
+
+
+ + ☆ Efficient Dataset Distillation via Minimax Diffusion + + +
+ Dataset distillation reduces the storage and computational consumption of training a network by generating a small surrogate dataset that encapsulates rich information of the original large-scale one. However, previous distillation methods heavily rely on the sample-wise iterative optimization scheme. As the images-per-class (IPC) setting or image resolution grows larger, the necessary computation will demand overwhelming time and resources. In this work, we intend to incorporate generative diffusion techniques for computing the surrogate dataset. Observing that key factors for constructing an effective surrogate dataset are representativeness and diversity, we design additional minimax criteria in the generative training to enhance these facets for the generated images of diffusion models. We present a theoretical model of the process as hierarchical diffusion control, demonstrating the flexibility of the diffusion process to target these criteria without jeopardizing the faithfulness of the sample to the desired distribution. The proposed method achieves state-of-the-art validation performance while demanding much less computational resources. Under the 100-IPC setting on ImageWoof, our method requires less than one-twentieth the distillation time of previous methods, yet yields even better performance. Source code is available at https://github.com/vimar-gu/MinimaxDiffusion. + +
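The representativeness and diversity criteria can be expressed as simple minimax-style penalties in feature space. The sketch below is one possible reading under assumed definitions (distance of each real feature to its nearest synthetic feature for representativeness, nearest-neighbour separation among synthetic features for diversity); it is not the paper's exact losses.

```python
# Hedged sketch of minimax-style criteria in feature space (assumed forms):
# representativeness pulls synthetic features toward the worst-covered real
# samples; diversity pushes synthetic features apart from each other.
import torch

def minimax_criteria(real_feats, syn_feats):
    # real_feats: (N, d), syn_feats: (M, d)
    d = torch.cdist(real_feats, syn_feats)          # (N, M) pairwise distances
    representativeness = d.min(dim=1).values.max()  # worst-covered real sample
    d_syn = torch.cdist(syn_feats, syn_feats)
    d_syn.fill_diagonal_(float("inf"))
    diversity = d_syn.min(dim=1).values.min()       # closest pair of synthetics
    # Minimize the first term, maximize the second.
    return representativeness - diversity
```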
+
+
+
+
+ + ☆ Sparse Pedestrian Character Learning for Trajectory Prediction + + +
+ Pedestrian trajectory prediction in a first-person view has recently +attracted much attention due to its importance in autonomous driving. Recent +work utilizes pedestrian character information, \textit{i.e.}, action and +appearance, to improve the learned trajectory embedding and achieves +state-of-the-art performance. However, it neglects the invalid and negative +pedestrian character information, which is harmful to trajectory representation +and thus leads to performance degradation. To address this issue, we present a +two-stream sparse-character-based network~(TSNet) for pedestrian trajectory +prediction. Specifically, TSNet learns the negative-removed characters in the +sparse character representation stream to improve the trajectory embedding +obtained in the trajectory representation stream. Moreover, to model the +negative-removed characters, we propose a novel sparse character graph, +including the sparse category and sparse temporal character graphs, to learn +the different effects of various characters in category and temporal +dimensions, respectively. Extensive experiments on two first-person view +datasets, PIE and JAAD, show that our method outperforms existing +state-of-the-art methods. In addition, ablation studies demonstrate different +effects of various characters and prove that TSNet outperforms approaches +without eliminating negative characters. + +
+
+
+
+
+ + ☆ CaesarNeRF: Calibrated Semantic Representation for Few-shot + Generalizable Neural Rendering + + +
+ Generalizability and few-shot learning are key challenges in Neural Radiance +Fields (NeRF), often due to the lack of a holistic understanding in pixel-level +rendering. We introduce CaesarNeRF, an end-to-end approach that leverages +scene-level CAlibratEd SemAntic Representation along with pixel-level +representations to advance few-shot, generalizable neural rendering, +facilitating a holistic understanding without compromising high-quality +details. CaesarNeRF explicitly models pose differences of reference views to +combine scene-level semantic representations, providing a calibrated holistic +understanding. This calibration process aligns various viewpoints with precise +location and is further enhanced by sequential refinement to capture varying +details. Extensive experiments on public datasets, including LLFF, Shiny, +mip-NeRF 360, and MVImgNet, show that CaesarNeRF delivers state-of-the-art +performance across varying numbers of reference views, proving effective even +with a single reference image. The project page of this work can be found at +https://haidongz-usc.github.io/project/caesarnerf. + +
+
+
+
+
+ + ☆ Adaptive Image Registration: A Hybrid Approach Integrating Deep Learning + and Optimization Functions for Enhanced Precision + + +
+ Image registration has traditionally been done using two distinct approaches: learning-based methods, relying on robust deep neural networks, and optimization-based methods, applying complex mathematical transformations to warp images accordingly. Of course, both paradigms offer advantages and disadvantages, and, in this work, we seek to combine their respective strengths into a single streamlined framework, using the outputs of the learning-based method as initial parameters for optimization while prioritizing computational power for the image pairs that offer the greatest loss. Our investigations show an improvement of 0.3% in testing when utilizing the best-performing state-of-the-art model as the backbone of the framework, while maintaining the same inference time and incurring only a 0.8% loss in deformation field smoothness. + +
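The hybrid idea (a network's predicted deformation as the starting point for per-pair optimization) can be sketched as follows. The similarity and smoothness losses, the warping function, and the optimizer settings are assumptions for illustration, not the framework's actual configuration.

```python
# Sketch of learning-then-optimization registration (assumed losses and
# settings): a network predicts an initial deformation field, which is then
# refined for each image pair by gradient descent.
import torch

def refine_registration(model, warp, moving, fixed, steps=50, lr=1e-2, reg_w=0.1):
    # moving, fixed: (1, 1, H, W) image pair
    # model(moving, fixed) -> (1, 2, H, W) deformation field
    # warp(moving, field)  -> moving image resampled by the field (spatial transformer)
    with torch.no_grad():
        init_field = model(moving, fixed)
    field = init_field.clone().requires_grad_(True)
    opt = torch.optim.Adam([field], lr=lr)
    for _ in range(steps):
        opt.zero_grad()
        warped = warp(moving, field)
        sim = torch.mean((warped - fixed) ** 2)                       # assumed: MSE similarity
        smooth = field.diff(dim=-1).abs().mean() + field.diff(dim=-2).abs().mean()
        loss = sim + reg_w * smooth
        loss.backward()
        opt.step()
    return field.detach()
```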
+
+
+
+
+ + ☆ AerialBooth: Mutual Information Guidance for Text Controlled Aerial View + Synthesis from a Single Image + + +
+ We present a novel method, AerialBooth, for synthesizing the aerial view from a single input image using its text description. We leverage the pretrained text-to-2D-image stable diffusion model as prior knowledge of the 3D world. The model is finetuned in two steps to optimize for the text embedding and the UNet that reconstruct the input image and its inverse perspective mapping, respectively. The inverse perspective mapping creates variance within the text-image space of the diffusion model, while providing weak guidance for aerial view synthesis. At inference, we steer the contents of the generated image towards the input image using novel mutual information guidance that maximizes the information content between the probability distributions of the two images. We evaluate our approach on a wide spectrum of real and synthetic data, including natural scenes, indoor scenes, human action, etc. Through extensive experiments and ablation studies, we demonstrate the effectiveness of AerialBooth and also its generalizability to other text-controlled views. We also show that AerialBooth achieves the best viewpoint-fidelity trade-off through quantitative evaluation on 7 metrics analyzing viewpoint and fidelity w.r.t. the input image. Code and data are available at https://github.com/divyakraman/AerialBooth2023. + +
+
+
+
+
+ + ☆ DreamCreature: Crafting Photorealistic Virtual Creatures from + Imagination + + +
+ Recent text-to-image (T2I) generative models allow for high-quality synthesis +following either text instructions or visual examples. Despite their +capabilities, these models face limitations in creating new, detailed creatures +within specific categories (e.g., virtual dog or bird species), which are +valuable in digital asset creation and biodiversity analysis. To bridge this +gap, we introduce a novel task, Virtual Creatures Generation: Given a set of +unlabeled images of the target concepts (e.g., 200 bird species), we aim to +train a T2I model capable of creating new, hybrid concepts within diverse +backgrounds and contexts. We propose a new method called DreamCreature, which +identifies and extracts the underlying sub-concepts (e.g., body parts of a +specific species) in an unsupervised manner. The T2I thus adapts to generate +novel concepts (e.g., new bird species) with faithful structures and +photorealistic appearance by seamlessly and flexibly composing learned +sub-concepts. To enhance sub-concept fidelity and disentanglement, we extend +the textual inversion technique by incorporating an additional projector and +tailored attention loss regularization. Extensive experiments on two +fine-grained image benchmarks demonstrate the superiority of DreamCreature over +prior methods in both qualitative and quantitative evaluation. Ultimately, the +learned sub-concepts facilitate diverse creative applications, including +innovative consumer product designs and nuanced property modifications. + +
+
+ comment: Website: https://github.com/kamwoh/dreamcreature +
+
+
+
+
+ + ☆ MeshGPT: Generating Triangle Meshes with Decoder-Only Transformers + + +
+ We introduce MeshGPT, a new approach for generating triangle meshes that +reflects the compactness typical of artist-created meshes, in contrast to dense +triangle meshes extracted by iso-surfacing methods from neural fields. Inspired +by recent advances in powerful large language models, we adopt a sequence-based +approach to autoregressively generate triangle meshes as sequences of +triangles. We first learn a vocabulary of latent quantized embeddings, using +graph convolutions, which inform these embeddings of the local mesh geometry +and topology. These embeddings are sequenced and decoded into triangles by a +decoder, ensuring that they can effectively reconstruct the mesh. A transformer +is then trained on this learned vocabulary to predict the index of the next +embedding given previous embeddings. Once trained, our model can be +autoregressively sampled to generate new triangle meshes, directly generating +compact meshes with sharp edges, more closely imitating the efficient +triangulation patterns of human-crafted meshes. MeshGPT demonstrates a notable +improvement over state of the art mesh generation methods, with a 9% increase +in shape coverage and a 30-point enhancement in FID scores across various +categories. + +
+
+ comment: Project Page: https://nihalsid.github.io/mesh-gpt/, Video: + https://youtu.be/UV90O1_69_o +
+
+
+
+
+ + ♻ ☆ FutureHuman3D: Forecasting Complex Long-Term 3D Human Behavior from + Video Observations + + +
+ We present a generative approach to forecast long-term future human behavior +in 3D, requiring only weak supervision from readily available 2D human action +data. This is a fundamental task enabling many downstream applications. The +required ground-truth data is hard to capture in 3D (mocap suits, expensive +setups) but easy to acquire in 2D (simple RGB cameras). Thus, we design our +method to only require 2D RGB data while being able to generate 3D human motion +sequences. We use a differentiable 2D projection scheme in an autoregressive +manner for weak supervision, and an adversarial loss for 3D regularization. Our +method predicts long and complex behavior sequences (e.g. cooking, assembly) +consisting of multiple sub-actions. We tackle this in a semantically +hierarchical manner, jointly predicting high-level coarse action labels +together with their low-level fine-grained realizations as characteristic 3D +human poses. We observe that these two action representations are coupled in +nature, and joint prediction benefits both action and pose forecasting. Our +experiments demonstrate the complementary nature of joint action and 3D pose +prediction: our joint approach outperforms each task treated individually, +enables robust longer-term sequence prediction, and outperforms alternative +approaches to forecast actions and characteristic 3D poses. + +
+
+ comment: Project Page: https://future-human-3d.christian-diller.de/ Video: + https://www.youtube.com/watch?v=18du85YFXL0 +
+
+
+
+
+ + ♻ ☆ Self-Guided Diffusion Models CVPR 2023 + + +
+ Diffusion models have demonstrated remarkable progress in image generation +quality, especially when guidance is used to control the generative process. +However, guidance requires a large amount of image-annotation pairs for +training and is thus dependent on their availability, correctness and +unbiasedness. In this paper, we eliminate the need for such annotation by +instead leveraging the flexibility of self-supervision signals to design a +framework for self-guided diffusion models. By leveraging a feature extraction +function and a self-annotation function, our method provides guidance signals +at various image granularities: from the level of holistic images to object +boxes and even segmentation masks. Our experiments on single-label and +multi-label image datasets demonstrate that self-labeled guidance always +outperforms diffusion models without guidance and may even surpass guidance +based on ground-truth labels, especially on unbalanced data. When equipped with +self-supervised box or mask proposals, our method further generates visually +diverse yet semantically consistent images, without the need for any class, +box, or segment label annotation. Self-guided diffusion is simple, flexible and +expected to profit from deployment at scale. Source code will be at: +https://taohu.me/sgdm/ + +
+
+ comment: CVPR 2023 +
+
+
+
+
+ + ♻ ☆ What If the TV Was Off? Examining Counterfactual Reasoning Abilities of + Multi-modal Language Models + + +
+ Counterfactual reasoning, a fundamental aspect of human cognition, involves contemplating alternatives to established facts or past events, significantly enhancing our abilities in planning and decision-making. In light of the advancements in current multi-modal large language models, we explore their effectiveness in counterfactual reasoning. To facilitate this investigation, we introduce a novel dataset, C-VQA, specifically designed to test the counterfactual reasoning capabilities of modern multi-modal large language models. This dataset is constructed by infusing original questions with counterfactual presuppositions, spanning various types such as numerical and boolean queries. It encompasses a mix of real and synthetic data, representing a wide range of difficulty levels. Our thorough evaluations of contemporary vision-language models using this dataset have revealed substantial performance drops, with some models showing up to a 40% decrease, highlighting a significant gap between current models and human-like vision reasoning capabilities. We hope our dataset will serve as a vital benchmark for evaluating the counterfactual reasoning capabilities of models. Code and dataset are publicly available at https://bzhao.me/C-VQA/. + +
+
+
+
+
+ + ♻ ☆ UFOGen: You Forward Once Large Scale Text-to-Image Generation via + Diffusion GANs + + +
+ Text-to-image diffusion models have demonstrated remarkable capabilities in +transforming textual prompts into coherent images, yet the computational cost +of their inference remains a persistent challenge. To address this issue, we +present UFOGen, a novel generative model designed for ultra-fast, one-step +text-to-image synthesis. In contrast to conventional approaches that focus on +improving samplers or employing distillation techniques for diffusion models, +UFOGen adopts a hybrid methodology, integrating diffusion models with a GAN +objective. Leveraging a newly introduced diffusion-GAN objective and +initialization with pre-trained diffusion models, UFOGen excels in efficiently +generating high-quality images conditioned on textual descriptions in a single +step. Beyond traditional text-to-image generation, UFOGen showcases versatility +in applications. Notably, UFOGen stands among the pioneering models enabling +one-step text-to-image generation and diverse downstream tasks, presenting a +significant advancement in the landscape of efficient generative models. + +
+
+
+
+
+ + ♻ ☆ AST: Effective Dataset Distillation through Alignment with Smooth and + High-Quality Expert Trajectories + + +
+ Training large AI models typically requires large-scale datasets in the +machine learning process, making training and parameter-tuning process both +time-consuming and costly. Some researchers address this problem by carefully +synthesizing a very small number of highly representative and informative +samples from real-world datasets. This approach, known as Dataset Distillation +(DD), proposes a perspective for data-efficient learning. Despite recent +progress in this field, the performance of existing methods still cannot meet +expectations, and distilled datasets cannot effectively replace original +datasets. In this paper, unlike previous methods that focus solely on improving +the effectiveness of student distillation, we recognize and leverage the +important mutual influence between expert and student models. We observed that +the smoothness of expert trajectories has a significant impact on subsequent +student parameter alignment. Based on this, we propose an effective DD +framework named AST, standing for Alignment with Smooth and high-quality expert +Trajectories. We devise the integration of clipping loss and gradient penalty +to regulate the rate of parameter changes in expert trajectory generation. To +further refine the student parameter alignment with expert trajectory, we put +forward representative initialization for the synthetic dataset and balanced +inner-loop loss in response to the sensitivity exhibited towards randomly +initialized variables during distillation. We also propose two enhancement +strategies, namely intermediate matching loss and weight perturbation, to +mitigate the potential occurrence of cumulative errors. We conduct extensive +experiments on datasets of different scales, sizes, and resolutions. The +results demonstrate that the proposed method significantly outperforms prior +methods. + +
+
+
+
+
+ + ♻ ☆ Applications of Large Scale Foundation Models for Autonomous Driving + + +
+ Since the DARPA Grand Challenges (rural) in 2004/05 and the Urban Challenges in 2007, autonomous driving has been the most active field of AI applications. Recently, powered by large language models (LLMs), chat systems such as ChatGPT and PaLM have emerged and rapidly become a promising direction toward artificial general intelligence (AGI) in natural language processing (NLP). It is therefore natural to ask whether these abilities could be employed to reformulate autonomous driving. By combining LLMs with foundation models, it is possible to utilize human knowledge, commonsense, and reasoning to rebuild autonomous driving systems out of the current long-tailed AI dilemma. In this paper, we investigate the techniques of foundation models and LLMs applied to autonomous driving, categorized as simulation, world models, data annotation, and planning or end-to-end (E2E) solutions, etc. + +
+
+ comment: 22 pages. arXiv admin note: text overlap with arXiv:2304.03589 by + other authors +
+
+
+
+
+ + ♻ ☆ From Isolated Islands to Pangea: Unifying Semantic Space for Human + Action Understanding + + +
+ Action understanding is a vital step toward intelligent agents and has attracted long-term attention. It can be formulated as a mapping from the physical action space to the semantic space. Typically, researchers have built action datasets according to idiosyncratic choices to define classes and push the envelope of their respective benchmarks. Thus, datasets are incompatible with each other, like "Isolated Islands", due to semantic gaps and various class granularities, e.g., do housework in dataset A and wash plate in dataset B. We argue that a more principled semantic space is an urgent need to concentrate the community's efforts and enable us to use all datasets together to pursue generalizable action learning. To this end, we design a structured action semantic space in view of the verb taxonomy hierarchy, covering massive actions. By aligning the classes of previous datasets to our semantic space, we gather (image/video/skeleton/MoCap) datasets into a unified database with a unified label system, i.e., bridging "isolated islands" into a "Pangea". Accordingly, we propose a novel model mapping from the physical space to the semantic space to fully use Pangea. In extensive experiments, our new system shows significant superiority, especially in transfer learning. Code and data will be made publicly available. + +
+
+ comment: Project Webpage: https://mvig-rhos.com/pangea +
+
+
+
+
+ + ♻ ☆ ENIGMA-51: Towards a Fine-Grained Understanding of Human-Object + Interactions in Industrial Scenarios + + +
+ ENIGMA-51 is a new egocentric dataset acquired in an industrial scenario by 19 subjects who followed instructions to complete the repair of electrical boards using industrial tools (e.g., an electric screwdriver) and equipment (e.g., an oscilloscope). The 51 egocentric video sequences are densely annotated with a rich set of labels that enable the systematic study of human behavior in the industrial domain. We provide benchmarks on four tasks related to human behavior: 1) untrimmed temporal detection of human-object interactions, 2) egocentric human-object interaction detection, 3) short-term object interaction anticipation, and 4) natural language understanding of intents and entities. Baseline results show that the ENIGMA-51 dataset poses a challenging benchmark to study human behavior in industrial scenarios. We publicly release the dataset at https://iplab.dmi.unict.it/ENIGMA-51. + +
+
+
+
+
+ + ♻ ☆ The Chosen One: Consistent Characters in Text-to-Image Diffusion Models + + +
+ Recent advances in text-to-image generation models have unlocked vast +potential for visual creativity. However, these models struggle with generation +of consistent characters, a crucial aspect for numerous real-world applications +such as story visualization, game development asset design, advertising, and +more. Current methods typically rely on multiple pre-existing images of the +target character or involve labor-intensive manual processes. In this work, we +propose a fully automated solution for consistent character generation, with +the sole input being a text prompt. We introduce an iterative procedure that, +at each stage, identifies a coherent set of images sharing a similar identity +and extracts a more consistent identity from this set. Our quantitative +analysis demonstrates that our method strikes a better balance between prompt +alignment and identity consistency compared to the baseline methods, and these +findings are reinforced by a user study. To conclude, we showcase several +practical applications of our approach. Project page is available at +https://omriavrahami.com/the-chosen-one + +
+
+ comment: Project page is available at https://omriavrahami.com/the-chosen-one +
+
+
+
+
+ + ♻ ☆ LLM-driven Multimodal Target Volume Contouring in Radiation Oncology + + +
+ Target volume contouring for radiation therapy is considered significantly more challenging than normal organ segmentation tasks, as it necessitates the utilization of both image and text-based clinical information. Inspired by the recent advancement of large language models (LLMs) that can facilitate the integration of textual information and images, here we present a novel LLM-driven multimodal AI that utilizes clinical text information and is applicable to the challenging task of target volume contouring for radiation therapy, and we validate it within the context of breast cancer radiation therapy target volume contouring. Using external validation and data-insufficient environments, attributes that are highly conducive to real-world applications, we demonstrate that the proposed model exhibits markedly improved performance compared to conventional vision-only AI models, particularly exhibiting robust generalization performance and data efficiency. To the best of our knowledge, this is the first LLM-driven multimodal AI model that integrates clinical text information into target volume delineation for radiation oncology. + +
+
+
+
+
+ + ♻ ☆ NEURAL MARIONETTE: A Transformer-based Multi-action Human Motion + Synthesis System + + +
+ We present a neural network-based system for long-term, multi-action human +motion synthesis. The system, dubbed as NEURAL MARIONETTE, can produce +high-quality and meaningful motions with smooth transitions from simple user +input, including a sequence of action tags with expected action duration, and +optionally a hand-drawn moving trajectory if the user specifies. The core of +our system is a novel Transformer-based motion generation model, namely +MARIONET, which can generate diverse motions given action tags. Different from +existing motion generation models, MARIONET utilizes contextual information +from the past motion clip and future action tag, dedicated to generating +actions that can smoothly blend historical and future actions. Specifically, +MARIONET first encodes target action tag and contextual information into an +action-level latent code. The code is unfolded into frame-level control signals +via a time unrolling module, which could be then combined with other +frame-level control signals like the target trajectory. Motion frames are then +generated in an auto-regressive way. By sequentially applying MARIONET, the +system NEURAL MARIONETTE can robustly generate long-term, multi-action motions +with the help of two simple schemes, namely "Shadow Start" and "Action +Revision". Along with the novel system, we also present a new dataset dedicated +to the multi-action motion synthesis task, which contains both action tags and +their contextual information. Extensive experiments are conducted to study the +action accuracy, naturalism, and transition smoothness of the motions generated +by our system. + +
+
+
+
+
+ + ♻ ☆ 3DGAUnet: 3D generative adversarial networks with a 3D U-Net based + generator to achieve the accurate and effective synthesis of clinical tumor + image data for pancreatic cancer + + +
+ Pancreatic ductal adenocarcinoma (PDAC) presents a critical global health +challenge, and early detection is crucial for improving the 5-year survival +rate. Recent medical imaging and computational algorithm advances offer +potential solutions for early diagnosis. Deep learning, particularly in the +form of convolutional neural networks (CNNs), has demonstrated success in +medical image analysis tasks, including classification and segmentation. +However, the limited availability of clinical data for training purposes +continues to provide a significant obstacle. Data augmentation, generative +adversarial networks (GANs), and cross-validation are potential techniques to +address this limitation and improve model performance, but effective solutions +are still rare for 3D PDAC, where contrast is especially poor owing to the high +heterogeneity in both tumor and background tissues. In this study, we developed +a new GAN-based model, named 3DGAUnet, for generating realistic 3D CT images of +PDAC tumors and pancreatic tissue, which can generate the interslice connection +data that the existing 2D CT image synthesis models lack. Our innovation is to +develop a 3D U-Net architecture for the generator to improve shape and texture +learning for PDAC tumors and pancreatic tissue. Our approach offers a promising +path to tackle the urgent requirement for creative and synergistic methods to +combat PDAC. The development of this GAN-based model has the potential to +alleviate data scarcity issues, elevate the quality of synthesized data, and +thereby facilitate the progression of deep learning models to enhance the +accuracy and early detection of PDAC tumors, which could profoundly impact +patient outcomes. Furthermore, this model has the potential to be adapted to +other types of solid tumors, hence making significant contributions to the +field of medical imaging in terms of image processing models. + +
+
+ comment: Published on Cancers: Shi, Yu, Hannah Tang, Michael J. Baine, Michael + A. Hollingsworth, Huijing Du, Dandan Zheng, Chi Zhang, and Hongfeng Yu. 2023. + "3DGAUnet: 3D Generative Adversarial Networks with a 3D U-Net Based Generator + to Achieve the Accurate and Effective Synthesis of Clinical Tumor Image Data + for Pancreatic Cancer" Cancers 15, no. 23: 5496 +
+
+
+
+
+ + ♻ ☆ CALICO: Self-Supervised Camera-LiDAR Contrastive Pre-training for BEV + Perception + + +
+ Perception is crucial in the realm of autonomous driving systems, where +bird's eye view (BEV)-based architectures have recently reached +state-of-the-art performance. The desirability of self-supervised +representation learning stems from the expensive and laborious process of +annotating 2D and 3D data. Although previous research has investigated +pretraining methods for both LiDAR and camera-based 3D object detection, a +unified pretraining framework for multimodal BEV perception is missing. In this +study, we introduce CALICO, a novel framework that applies contrastive +objectives to both LiDAR and camera backbones. Specifically, CALICO +incorporates two stages: point-region contrast (PRC) and region-aware +distillation (RAD). PRC better balances the region- and scene-level +representation learning on the LiDAR modality and offers significant +performance improvement compared to existing methods. RAD effectively achieves +contrastive distillation on our self-trained teacher model. CALICO's efficacy +is substantiated by extensive evaluations on 3D object detection and BEV map +segmentation tasks, where it delivers significant performance improvements. +Notably, CALICO outperforms the baseline method by 10.5% and 8.6% on NDS and +mAP. Moreover, CALICO boosts the robustness of multimodal 3D object detection +against adversarial attacks and corruption. Additionally, our framework can be +tailored to different backbones and heads, positioning it as a promising +approach for multimodal BEV perception. + +
+
+
+
+
+ + ♻ ☆ Perceptual Assessment and Optimization of High Dynamic Range Image + Rendering + + +
+ The increasing popularity of high dynamic range (HDR) imaging stems from its ability to faithfully capture luminance levels in natural scenes. However, HDR image quality assessment has been insufficiently addressed. Existing models are mostly designed for low dynamic range (LDR) images, and they correlate poorly with human perception of HDR image quality. To fill this gap, we propose a family of HDR quality metrics by transferring recent advancements in the LDR domain. The key step in our approach is to employ a simple inverse display model to decompose an HDR image into a stack of LDR images with varying exposures. Subsequently, these LDR images are evaluated using state-of-the-art LDR quality metrics. Our family of HDR quality models offers three notable advantages. First, specific exposures (i.e., luminance ranges) can be weighted to emphasize their assessment when calculating the overall quality score. Second, our HDR quality metrics directly inherit the capabilities of their base LDR quality models in assessing LDR images. Third, our metrics do not rely on human perceptual data of HDR image quality for re-calibration. Experiments conducted on four human-rated HDR image quality datasets indicate that our HDR quality metrics consistently outperform existing methods, including the HDR-VDP family. Furthermore, we demonstrate the promise of our models in the perceptual optimization of HDR novel view synthesis. + +
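The key step, a simple inverse display model that decomposes an HDR image into LDR images at several exposures which are then scored by an existing LDR metric, can be sketched as below. The exposure values, tone curve, and weighting are assumptions rather than the paper's exact model.

```python
# Sketch of an inverse-display-model decomposition (assumed exposures and
# gamma): an HDR image is turned into a stack of LDR exposures, and the
# per-exposure LDR quality scores are combined into one HDR score.
import numpy as np

def hdr_to_ldr_stack(hdr, exposures=(-2.0, 0.0, 2.0), gamma=2.2):
    """hdr: float array of linear luminance; returns a list of 8-bit LDR images."""
    stack = []
    for ev in exposures:
        scaled = hdr * (2.0 ** ev)
        ldr = np.clip(scaled, 0.0, 1.0) ** (1.0 / gamma)
        stack.append((ldr * 255).astype(np.uint8))
    return stack

def hdr_quality(hdr_ref, hdr_test, ldr_metric, weights=(1.0, 1.0, 1.0)):
    """Score an HDR test image against a reference with any LDR quality metric."""
    refs, tests = hdr_to_ldr_stack(hdr_ref), hdr_to_ldr_stack(hdr_test)
    scores = [ldr_metric(r, t) for r, t in zip(refs, tests)]
    return float(np.average(scores, weights=weights))
```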
+
+
+
+
+ + ♻ ☆ AI-Generated Images Introduce Invisible Relevance Bias to Text-Image + Retrieval + + +
+ With the advancement of generation models, AI-generated content (AIGC) is +becoming more realistic, flooding the Internet. A recent study suggests that +this phenomenon has elevated the issue of source bias in text retrieval for web +searches. Specifically, neural retrieval models tend to rank generated texts +higher than human-written texts. In this paper, we extend the study of this +bias to cross-modal retrieval. Firstly, we successfully construct a suitable +benchmark to explore the existence of the bias. Subsequent extensive +experiments on this benchmark reveal that AI-generated images introduce an +invisible relevance bias to text-image retrieval models. Specifically, our +experiments show that text-image retrieval models tend to rank the AI-generated +images higher than the real images, even though the AI-generated images do not +exhibit more visually relevant features to the query than real images. This +invisible relevance bias is prevalent across retrieval models with varying +training data and architectures. Furthermore, our subsequent exploration +reveals that the inclusion of AI-generated images in the training data of the +retrieval models exacerbates the invisible relevance bias. The above phenomenon +triggers a vicious cycle, which makes the invisible relevance bias become more +and more serious. To elucidate the potential causes of invisible relevance and +address the aforementioned issues, we introduce an effective training method +aimed at alleviating the invisible relevance bias. Subsequently, we apply our +proposed debiasing method to retroactively identify the causes of invisible +relevance, revealing that the AI-generated images induce the image encoder to +embed additional information into their representation. This information +exhibits a certain consistency across generated images with different semantics +and can make the retriever estimate a higher relevance score. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ♻ ☆ DynamiCrafter: Animating Open-domain Images with Video Diffusion Priors + + +
+ Animating a still image offers an engaging visual experience. Traditional +image animation techniques mainly focus on animating natural scenes with +stochastic dynamics (e.g. clouds and fluid) or domain-specific motions (e.g. +human hair or body motions), which limits their applicability to more +general visual content. To overcome this limitation, we explore the synthesis +of dynamic content for open-domain images, converting them into animated +videos. The key idea is to utilize the motion prior of text-to-video diffusion +models by incorporating the image into the generative process as guidance. +Given an image, we first project it into a text-aligned rich context +representation space using a query transformer, which helps the video +model digest the image content in a compatible fashion. However, some visual +details are still difficult to preserve in the resultant videos. To supply +more precise image information, we further feed the full image to the +diffusion model by concatenating it with the initial noises. Experimental +results show that our proposed method can produce visually convincing and more +logical & natural motions, as well as higher conformity to the input image. +Comparative evaluation demonstrates the notable superiority of our approach +over existing competitors. + 
+
+ comment: Project page: https://doubiiu.github.io/projects/DynamiCrafter +
+
+
+
+
+ + ♻ ☆ Efficient Perception, Planning, and Control Algorithms for Vision-Based + Automated Vehicles + + +
+ Autonomous vehicles have limited computational resources; hence, their +control systems must be efficient. The cost and size of sensors have limited +the development of self-driving cars. To overcome these restrictions, this +study proposes an efficient framework for the operation of vision-based +automatic vehicles; the framework requires only a monocular camera and a few +inexpensive radars. The proposed algorithm comprises a multi-task UNet (MTUNet) +network for extracting image features and constrained iterative linear +quadratic regulator (CILQR) and vision predictive control (VPC) modules for +rapid motion planning and control. MTUNet is designed to simultaneously solve +lane line segmentation, the ego vehicle's heading angle regression, road type +classification, and traffic object detection tasks at approximately 40 FPS +(frames per second) for 228 x 228 pixel RGB input images. The CILQR controllers +then use the MTUNet outputs and radar data as inputs to produce driving +commands for lateral and longitudinal vehicle guidance within only 1 ms. In +particular, the VPC algorithm is included to reduce steering command latency to +below actuator latency to prevent self-driving vehicle performance degradation +during tight turns. The VPC algorithm uses road curvature data from MTUNet to +estimate the correction of the current steering angle at a look-ahead point to +adjust the turning amount. Including the VPC algorithm in a VPC-CILQR +controller on curvy roads leads to higher performance than CILQR alone. Our +experiments demonstrate that the proposed autonomous driving system, which does +not require high-definition maps, could be applied in current autonomous +vehicles. + +
+
+ comment: 10 figures, 13 pages +
+
+
+
+
+ + ♻ ☆ Continual Test-time Domain Adaptation via Dynamic Sample Selection + + +
+ The objective of Continual Test-time Domain Adaptation (CTDA) is to gradually +adapt a pre-trained model to a sequence of target domains without accessing the +source data. This paper proposes a Dynamic Sample Selection (DSS) method for +CTDA. DSS consists of dynamic thresholding, positive learning, and negative +learning processes. Traditionally, models learn from unlabeled unknown +environment data and rely equally on all samples' pseudo-labels to update their +parameters through self-training. However, noisy predictions exist in these +pseudo-labels, so not all samples are equally trustworthy. Therefore, in our +method, a dynamic thresholding module is first designed to separate suspected +low-quality samples from high-quality ones. The selected low-quality samples are +more likely to be wrongly predicted. Therefore, we apply joint positive and +negative learning on both high- and low-quality samples to reduce the risk of +using wrong information. We conduct extensive experiments that demonstrate the +effectiveness of our proposed method for CTDA in the image domain, +outperforming state-of-the-art results. Furthermore, our approach is also +evaluated in the 3D point cloud domain, showcasing its versatility and +potential for broader applicability. + 
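An illustrative sketch of the split-then-learn idea described in this abstract; the paper's actual thresholding rule and loss forms differ. Here a per-batch confidence percentile (an assumed choice) separates high-quality from suspected low-quality samples, with standard cross-entropy on the former and a complementary ("negative") term on the latter that pushes probability away from the noisy pseudo-label.

```python
import torch
import torch.nn.functional as F

def dss_like_loss(logits, q=0.3):
    probs = logits.softmax(dim=1)
    conf, pseudo = probs.max(dim=1)
    thresh = torch.quantile(conf, q)            # dynamic, batch-dependent threshold
    high = conf >= thresh
    low = ~high

    loss = logits.new_zeros(())
    if high.any():                               # positive learning on confident samples
        loss = loss + F.cross_entropy(logits[high], pseudo[high])
    if low.any():                                # negative learning on suspect samples
        neg = -torch.log(1.0 - probs[low, pseudo[low]] + 1e-6).mean()
        loss = loss + neg
    return loss

if __name__ == "__main__":
    logits = torch.randn(16, 10, requires_grad=True)
    dss_like_loss(logits).backward()
```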
+
+ comment: 2024 IEEE/CVF Winter Conference on Applications of Computer Vision +
+
+
+
+
+ + ♻ ☆ A Closer Look at Audio-Visual Segmentation + + +
+ Audio-visual segmentation (AVS) is a complex task that involves accurately +segmenting the corresponding sounding object based on audio-visual queries. +Successful audio-visual learning requires two essential components: 1) an +unbiased dataset with high-quality pixel-level multi-class labels, and 2) a +model capable of effectively linking audio information with its corresponding +visual object. However, these two requirements are only partially addressed by +current methods, with training sets containing biased audio-visual data, and +models that generalise poorly beyond this biased training set. In this work, we +propose a new strategy to build cost-effective and relatively unbiased +audio-visual semantic segmentation benchmarks. Our strategy, called Visual +Post-production (VPO), explores the observation that it is not necessary to +have explicit audio-visual pairs extracted from single video sources to build +such benchmarks. We also refine the previously proposed AVSBench to transform +it into the audio-visual semantic segmentation benchmark AVSBench-Single+. +Furthermore, this paper introduces a new pixel-wise audio-visual contrastive +learning method to enable a better generalisation of the model beyond the +training set. We verify the validity of the VPO strategy by showing that +state-of-the-art (SOTA) models trained with datasets built by matching audio +and visual data from different sources or with datasets containing audio and +visual data from the same video source produce almost the same accuracy. Then, +using the proposed VPO benchmarks and AVSBench-Single+, we show that our method +produces more accurate audio-visual semantic segmentation than SOTA models. +Code and dataset will be available. + +
+
+
+
+
+ + ♻ ☆ AdaptGuard: Defending Against Universal Attacks for Model Adaptation ICCV2023 + + +
+ Model adaptation aims at solving the domain transfer problem under the +constraint of only accessing the pretrained source models. With the increasing +considerations of data privacy and transmission efficiency, this paradigm has +been gaining recent popularity. This paper studies the vulnerability to +universal attacks transferred from the source domain during model adaptation +algorithms due to the existence of malicious providers. We explore both +universal adversarial perturbations and backdoor attacks as loopholes on the +source side and discover that they still survive in the target models after +adaptation. To address this issue, we propose a model preprocessing framework, +named AdaptGuard, to improve the security of model adaptation algorithms. +AdaptGuard avoids direct use of the risky source parameters through knowledge +distillation and utilizes the pseudo adversarial samples under adjusted radius +to enhance the robustness. AdaptGuard is a plug-and-play module that requires +neither robust pretrained models nor any changes for the following model +adaptation algorithms. Extensive results on three commonly used datasets and +two popular adaptation methods validate that AdaptGuard can effectively defend +against universal attacks and maintain clean accuracy in the target domain +simultaneously. We hope this research will shed light on the safety and +robustness of transfer learning. Code is available at +https://github.com/TomSheng21/AdaptGuard. + +
+
+ comment: ICCV2023 +
+
+
+
+
+ + ♻ ☆ RealignDiff: Boosting Text-to-Image Diffusion Model with Coarse-to-fine + Semantic Re-alignment + + +
+ Recent advances in text-to-image diffusion models have achieved remarkable +success in generating high-quality, realistic images from textual descriptions. +However, these approaches have faced challenges in precisely aligning the +generated visual content with the textual concepts described in the prompts. In +this paper, we propose a two-stage coarse-to-fine semantic re-alignment method, +named RealignDiff, aimed at improving the alignment between text and images in +text-to-image diffusion models. In the coarse semantic re-alignment phase, a +novel caption reward, leveraging the BLIP-2 model, is proposed to evaluate the +semantic discrepancy between the generated image caption and the given text +prompt. Subsequently, the fine semantic re-alignment stage employs a local +dense caption generation module and a re-weighting attention modulation module +to refine the previously generated images from a local semantic view. +Experimental results on the MS-COCO benchmark demonstrate that the proposed +two-stage coarse-to-fine semantic re-alignment method outperforms other +baseline re-alignment techniques by a substantial margin in both visual quality +and semantic similarity with the input prompt. + +
+
+
+
+
+ + ♻ ☆ TetraSphere: A Neural Descriptor for O(3)-Invariant Point Cloud Analysis + + +
+ In many practical applications, 3D point cloud analysis requires rotation +invariance. In this paper, we present a learnable descriptor invariant under 3D +rotations and reflections, i.e., the O(3) actions, utilizing the recently +introduced steerable 3D spherical neurons and vector neurons. Specifically, we +propose an embedding of the 3D spherical neurons into 4D vector neurons, which +leverages end-to-end training of the model. In our approach, we perform +TetraTransform--an equivariant embedding of the 3D input into 4D, constructed +from the steerable neurons--and extract deeper O(3)-equivariant features using +vector neurons. This integration of the TetraTransform into the VN-DGCNN +framework, termed TetraSphere, negligibly increases the number of parameters by +less than 0.0002%. TetraSphere sets a new state-of-the-art performance +classifying randomly rotated real-world object scans of the challenging subsets +of ScanObjectNN. Additionally, TetraSphere outperforms all equivariant methods +on randomly rotated synthetic data: classifying objects from ModelNet40 and +segmenting parts of the ShapeNet shapes. Thus, our results reveal the practical +value of steerable 3D spherical neurons for learning in 3D Euclidean space. + +
+
+
+
+
+ + ♻ ☆ DoUnseen: Tuning-Free Class-Adaptive Object Detection of Unseen Objects + for Robotic Grasping + + +
+ How can we segment varying numbers of objects where each specific object +represents its own separate class? To make the problem even more realistic, how +can we add and delete classes on the fly without retraining or fine-tuning? +This is the case for robotic applications where no datasets of the objects exist, +or for applications that include thousands of objects (e.g., in logistics), where it +is impossible to train a single model to learn all of the objects. Most current +research on object segmentation for robotic grasping focuses on class-level +object segmentation (e.g., box, cup, bottle), closed sets (specific objects of +a dataset; for example, the YCB dataset), or deep learning-based template matching. +In this work, we are interested in open sets where the number of classes is +unknown, varying, and without pre-knowledge about the objects' types. We +consider each specific object as its own separate class. Our goal is to develop +an object detector that requires no fine-tuning and can add any object as a +class just by capturing a few images of the object. Our main idea is to break +the segmentation pipeline into two steps by combining unseen object +segmentation networks cascaded with class-adaptive classifiers. We evaluate our +class-adaptive object detector on unseen datasets and compare it to a trained +Mask R-CNN on those datasets. The results show that the performance varies from +practical to unsuitable depending on the environment setup and the objects +being handled. The code is available in our DoUnseen library repository. + 
+
+ comment: presented at RSS 2023 Workshop on Perception and Manipulation + Challenges for Warehouse Automation +
+
+
+
+
+ + ♻ ☆ RealLiFe: Real-Time Light Field Reconstruction via Hierarchical Sparse + Gradient Descent + + +
+ With the rise of Extended Reality (XR) technology, there is a growing need +for real-time light field generation from sparse view inputs. Existing methods +can be classified into offline techniques, which can generate high-quality +novel views but at the cost of long inference/training time, and online +methods, which either lack generalizability or produce unsatisfactory results. +However, we have observed that the intrinsic sparse manifold of Multi-plane +Images (MPI) enables a significant acceleration of light field generation while +maintaining rendering quality. Based on this insight, we introduce EffLiFe, a +novel light field optimization method, which leverages the proposed +Hierarchical Sparse Gradient Descent (HSGD) to produce high-quality light +fields from sparse view images in real time. Technically, the coarse MPI of a +scene is first generated using a 3D CNN, and it is further sparsely optimized +by focusing only on important MPI gradients in a few iterations. Nevertheless, +relying solely on optimization can lead to artifacts at occlusion boundaries. +Therefore, we propose an occlusion-aware iterative refinement module that +removes visual artifacts in occluded regions by iteratively filtering the +input. Extensive experiments demonstrate that our method achieves comparable +visual quality while being 100x faster on average than state-of-the-art offline +methods and delivering better performance (about 2 dB higher in PSNR) compared +to other online approaches. + +
+
+ comment: Submitted to IEEE TPAMI +
+
+
+
+
+ + ♻ ☆ PanoVOS: Bridging Non-panoramic and Panoramic Views with Transformer for + Video Segmentation + + +
+ Panoramic videos contain richer spatial information and have attracted +tremendous amounts of attention due to their exceptional experience in some +fields such as autonomous driving and virtual reality. However, existing +datasets for video segmentation only focus on conventional planar images. To +address the challenge, in this paper, we present a panoramic video dataset, +PanoVOS. The dataset provides 150 videos with high video resolutions and +diverse motions. To quantify the domain gap between 2D planar videos and +panoramic videos, we evaluate 15 off-the-shelf video object segmentation (VOS) +models on PanoVOS. Through error analysis, we found that all of them fail to +tackle the pixel-level content discontinuities of panoramic videos. Thus, we present a +Panoramic Space Consistency Transformer (PSCFormer), which can effectively +utilize the semantic boundary information of the previous frame for pixel-level +matching with the current frame. Extensive experiments demonstrate that +compared with the previous SOTA models, our PSCFormer network exhibits a great +advantage in terms of segmentation results under the panoramic setting. Our +dataset poses new challenges in panoramic VOS and we hope that our PanoVOS can +advance the development of panoramic segmentation/tracking. + 
+
+
+
+
+ + ♻ ☆ Prompt-based test-time real image dehazing: a novel pipeline + + +
+ Existing methods attempt to improve models' generalization ability on +real-world hazy images by exploring well-designed training schemes (e.g., +CycleGAN, prior loss). However, most of them need very complicated training +procedures to achieve satisfactory results. In this work, we present a +novel testing pipeline called Prompt-based Test-Time Dehazing (PTTD) to help +generate visually pleasing results of real-captured hazy images during the +inference phase. We experimentally find that given a dehazing model trained on +synthetic data, by fine-tuning the statistics (i.e., mean and standard +deviation) of encoding features, PTTD is able to narrow the domain gap, +boosting the performance of real image dehazing. Accordingly, we first apply a +prompt generation module (PGM) to generate a visual prompt, which is the source +of appropriate statistical perturbations for mean and standard deviation. We +then integrate the feature adaptation module (FAM) into existing dehazing +models to adjust the original statistics under the guidance of the generated +prompt. Note that PTTD is model-agnostic and can be equipped with various +state-of-the-art dehazing models trained on synthetic hazy-clean pairs. +Extensive experimental results demonstrate that our PTTD is flexible while +achieving superior performance against state-of-the-art dehazing methods in +real-world scenarios. The source code of our PTTD will be made available at +https://github.com/cecret3350/PTTD-Dehazing. + 
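A small sketch of the statistics-adaptation idea described above, not the authors' PGM/FAM implementation: given encoder features of a real hazy image and of a "visual prompt" image from the same encoder layer, re-normalize the hazy features so that their per-channel mean and standard deviation follow the prompt's statistics. The tensor shapes and the plain re-normalization rule are assumptions for illustration.

```python
import torch

def adapt_statistics(feat, prompt_feat, eps=1e-5):
    """feat, prompt_feat: (N, C, H, W) feature maps from the same encoder layer."""
    mu, sigma = feat.mean((2, 3), keepdim=True), feat.std((2, 3), keepdim=True)
    mu_p, sigma_p = prompt_feat.mean((2, 3), keepdim=True), prompt_feat.std((2, 3), keepdim=True)
    normalized = (feat - mu) / (sigma + eps)   # remove the hazy image's statistics
    return normalized * sigma_p + mu_p         # apply the prompt's statistics

if __name__ == "__main__":
    f = torch.randn(1, 64, 32, 32)
    p = torch.randn(1, 64, 32, 32)
    print(adapt_statistics(f, p).shape)
```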
+
+ comment: update github link (https://github.com/cecret3350/PTTD-Dehazing) +
+
+
+
+
+ + ♻ ☆ MRGazer: Decoding Eye Gaze Points from Functional Magnetic Resonance + Imaging in Individual Space + + +
+ Eye-tracking research has proven valuable in understanding numerous cognitive +functions. Recently, Frey et al. provided an exciting deep learning method for +learning eye movements from fMRI data. However, it needed to co-register fMRI +into standard space to obtain eyeball masks, and thus required additional +templates and was time-consuming. To resolve this issue, in this paper, we +propose a framework named MRGazer for predicting eye gaze points from fMRI in +individual space. MRGazer consists of an eyeball extraction module and a +residual network-based eye gaze prediction module. Compared to the previous method, +the proposed framework skips the fMRI co-registration step, simplifies the +processing protocol and achieves end-to-end eye gaze regression. The proposed +method achieved superior performance on a variety of eye movement tasks compared to +the co-registration-based method, and delivered objective results in a +shorter time (~0.02 seconds per volume) than the prior method (~0.3 seconds +per volume). + 
+
+
+
+
+ + ♻ ☆ Empirical Study of PEFT techniques for Winter Wheat Segmentation + + +
+ Parameter Efficient Fine Tuning (PEFT) techniques have recently experienced +significant growth and have been extensively employed to adapt large vision and +language models to various domains, enabling satisfactory model performance +with minimal computational needs. Despite these advances, little research has +delved into potential PEFT applications in real-life scenarios, particularly +in the critical domains of remote sensing and crop monitoring. The diversity of +climates across different regions and the need for comprehensive large-scale +datasets have posed significant obstacles to accurately identifying crop types +across varying geographic locations and changing growing seasons. This study +seeks to bridge this gap by comprehensively exploring the feasibility of +cross-area and cross-year out-of-distribution generalization using the +State-of-the-Art (SOTA) wheat crop monitoring model. The aim of this work is to +explore PEFT approaches for crop monitoring. Specifically, we focus on adapting +the SOTA TSViT model to address winter wheat field segmentation, a critical +task for crop monitoring and food security. This adaptation process involves +integrating different PEFT techniques, including BitFit, LoRA, AdaptFormer, and +prompt tuning. Using PEFT techniques, we achieved notable results comparable to +those achieved using full fine-tuning methods while training only a mere 0.7% of the +parameters of the whole TSViT architecture. The in-house labeled dataset, +referred to as the Beqaa-Lebanon dataset, comprises high-quality annotated +polygons for wheat and non-wheat classes with a total surface of 170 km^2, over +five consecutive years. Using Sentinel-2 images, our model achieved an 84% +F1-score. We intend to publicly release the Lebanese winter wheat dataset, +code repository, and model weights. + 
+
+
+
+
+ + ♻ ☆ NeRFlame: FLAME-based conditioning of NeRF for 3D face rendering + + +
+ Traditional 3D face models are based on mesh representations with texture. +One of the most important models is FLAME (Faces Learned with an Articulated +Model and Expressions), which produces meshes of human faces that are fully +controllable. Unfortunately, such models have problems with capturing geometric +and appearance details. In contrast to mesh representation, the neural radiance +field (NeRF) produces extremely sharp renders. However, implicit methods are +hard to animate and do not generalize well to unseen expressions. It is not +trivial to effectively control NeRF models to obtain face manipulation. + The present paper proposes a novel approach, named NeRFlame, which combines +the strengths of both NeRF and FLAME methods. Our method enables high-quality +rendering capabilities of NeRF while also offering complete control over the +visual appearance, similar to FLAME. In contrast to traditional NeRF-based +structures that use neural networks for RGB color and volume density modeling, +our approach utilizes the FLAME mesh as a distinct density volume. +Consequently, color values exist only in the vicinity of the FLAME mesh. This +FLAME framework is seamlessly incorporated into the NeRF architecture for +predicting RGB colors, enabling our model to explicitly represent volume +density and implicitly capture RGB colors. + +
+
+
+
+
+ + ♻ ☆ RankFeat&RankWeight: Rank-1 Feature/Weight Removal for + Out-of-distribution Detection + + +
+ The task of out-of-distribution (OOD) detection is crucial for deploying +machine learning models in real-world settings. In this paper, we observe that +the singular value distributions of the in-distribution (ID) and OOD features +are quite different: the OOD feature matrix tends to have a larger dominant +singular value than the ID feature, and the class predictions of OOD samples +are largely determined by it. This observation motivates us to propose +\texttt{RankFeat}, a simple yet effective \emph{post hoc} approach for OOD +detection by removing the rank-1 matrix composed of the largest singular value +and the associated singular vectors from the high-level feature. +\texttt{RankFeat} achieves \emph{state-of-the-art} performance and reduces the +average false positive rate (FPR95) by 17.90\% compared with the previous best +method. The success of \texttt{RankFeat} motivates us to investigate whether a +similar phenomenon would exist in the parameter matrices of neural networks. We +thus propose \texttt{RankWeight}, which removes the rank-1 weight from the +parameter matrices of a single deep layer. Our \texttt{RankWeight} is also +\emph{post hoc} and only requires computing the rank-1 matrix once. As a +standalone approach, \texttt{RankWeight} has very competitive performance +against other methods across various backbones. Moreover, \texttt{RankWeight} +enjoys flexible compatibility with a wide range of OOD detection methods. The +combination of \texttt{RankWeight} and \texttt{RankFeat} establishes a new +\emph{state-of-the-art}, achieving an FPR95 as low as 16.13\% on +the ImageNet-1k benchmark. Extensive ablation studies and comprehensive +theoretical analyses are presented to support the empirical results. + 
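A minimal sketch of the rank-1 removal described above: subtract the leading singular component (s1 * u1 v1^T) from a high-level feature matrix before the logits are computed. Which layer is used, and how RankWeight is applied to a weight matrix, follow the paper rather than this snippet; the feature shape below is an assumption.

```python
import torch

def remove_rank1(x):
    """x: (B, C, HW) feature matrices; returns features with the top singular
    component of each matrix removed."""
    u, s, vh = torch.linalg.svd(x, full_matrices=False)
    rank1 = s[:, :1, None] * u[:, :, :1] @ vh[:, :1, :]   # s1 * u1 v1^T per sample
    return x - rank1

if __name__ == "__main__":
    feats = torch.randn(4, 2048, 49)   # e.g. flattened final-stage CNN features
    print(remove_rank1(feats).shape)
```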
+
+ comment: submitted to T-PAMI. arXiv admin note: substantial text overlap with + arXiv:2209.08590 +
+
+
+
+
+ + ♻ ☆ Uncovering the Hidden Cost of Model Compression + + +
+ In the era of resource-intensive foundation models, efficient adaptation in +downstream tasks has become paramount. Visual Prompting (VP), inspired by +prompting in Large Language Models (LLMs), has emerged as a key transfer +learning method in computer vision. Aligned with the growing significance of +efficiency, research in model compression has become pivotal to alleviate the +computational burden in both training and deploying over-parameterized neural +networks. A key goal in model compression is the development of sparse models +capable of matching or surpassing the performance of their over-parameterized, +dense counterparts. While prior research has explored the impact of model +sparsity on transfer learning, its effects on visual prompting-based transfer +remain unclear. This study addresses this gap, revealing that model sparsity +adversely affects the performance of visual prompting-based transfer, +particularly in low-data-volume scenarios. Furthermore, our findings highlight +the negative influence of sparsity on the calibration of downstream +visual-prompted models. This empirical exploration calls for a nuanced +understanding beyond accuracy in sparse settings, opening avenues for further +research in Visual Prompting for sparse models. Code and logs can be accessed +at https://github.com/landskape-ai/Reprogram_LT . + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Towards Omni-supervised Referring Expression Segmentation + + +
+ Referring Expression Segmentation (RES) is an emerging task in computer +vision, which segments the target instances in images based on text +descriptions. However, its development is plagued by the expensive segmentation +labels. To address this issue, we propose a new learning task for RES called +Omni-supervised Referring Expression Segmentation (Omni-RES), which aims to +make full use of unlabeled, fully labeled and weakly labeled data, e.g., +referring points or grounding boxes, for efficient RES training. To accomplish +this task, we also propose a novel yet strong baseline method for Omni-RES +based on the recently popular teacher-student learning, where the weak labels +are not directly transformed into supervision signals but used as a yardstick +to select and refine high-quality pseudo-masks for teacher-student learning. To +validate the proposed Omni-RES method, we apply it to a set of state-of-the-art +RES models and conduct extensive experiments on several RES datasets. The +experimental results show the clear merits of Omni-RES over the +fully-supervised and semi-supervised training schemes. For instance, with only +10% fully labeled data, Omni-RES can help the base model achieve 100% fully +supervised performance, and it also outperforms the semi-supervised alternative +by a large margin, e.g., +14.93% on RefCOCO and +14.95% on RefCOCO+, +respectively. More importantly, Omni-RES also enables the use of large-scale +vision-language data like Visual Genome to facilitate low-cost RES training, and +achieves new SOTA performance on RES, e.g., 80.66 on RefCOCO. + 
+
+
+
+
+ + ♻ ☆ R&B: Region and Boundary Aware Zero-shot Grounded Text-to-image + Generation + + +
+ Recent text-to-image (T2I) diffusion models have achieved remarkable progress +in generating high-quality images given text-prompts as input. However, these +models fail to convey appropriate spatial composition specified by a layout +instruction. In this work, we probe into zero-shot grounded T2I generation with +diffusion models, that is, generating images corresponding to the input layout +information without training auxiliary modules or finetuning diffusion models. +We propose a Region and Boundary (R&B) aware cross-attention guidance approach +that gradually modulates the attention maps of diffusion model during +generative process, and assists the model to synthesize images (1) with high +fidelity, (2) highly compatible with textual input, and (3) interpreting layout +instructions accurately. Specifically, we leverage the discrete sampling to +bridge the gap between consecutive attention maps and discrete layout +constraints, and design a region-aware loss to refine the generative layout +during diffusion process. We further propose a boundary-aware loss to +strengthen object discriminability within the corresponding regions. +Experimental results show that our method outperforms existing state-of-the-art +zero-shot grounded T2I generation methods by a large margin both qualitatively +and quantitatively on several benchmarks. + +
+
+ comment: Preprint. Under review. Project page: + https://sagileo.github.io/Region-and-Boundary +
+
+
+
+
+ + ♻ ☆ Concept Sliders: LoRA Adaptors for Precise Control in Diffusion Models + + +
+ We present a method to create interpretable concept sliders that enable +precise control over attributes in image generations from diffusion models. Our +approach identifies a low-rank parameter direction corresponding to one concept +while minimizing interference with other attributes. A slider is created using +a small set of prompts or sample images; thus slider directions can be created +for either textual or visual concepts. Concept Sliders are plug-and-play: they +can be composed efficiently and continuously modulated, enabling precise +control over image generation. In quantitative experiments comparing to +previous editing techniques, our sliders exhibit stronger targeted edits with +lower interference. We showcase sliders for weather, age, styles, and +expressions, as well as slider compositions. We show how sliders can transfer +latents from StyleGAN for intuitive editing of visual concepts for which +textual description is difficult. We also find that our method can help address +persistent quality issues in Stable Diffusion XL including repair of object +deformations and fixing distorted hands. Our code, data, and trained sliders +are available at https://sliders.baulab.info/ + +
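A hedged sketch of the low-rank "slider" idea described in this abstract: a learned pair of low-rank factors defines one concept direction for a weight matrix, and a scalar slider value modulates how strongly that direction is applied at inference. The training objective, target modules, and class name `SliderLinear` are illustrative assumptions, not the authors' implementation.

```python
import torch

class SliderLinear(torch.nn.Module):
    def __init__(self, base: torch.nn.Linear, rank: int = 4):
        super().__init__()
        self.base = base
        out_f, in_f = base.weight.shape
        self.A = torch.nn.Parameter(torch.randn(rank, in_f) * 0.01)  # low-rank factors
        self.B = torch.nn.Parameter(torch.zeros(out_f, rank))
        self.scale = 0.0                                             # the slider value

    def forward(self, x):
        # add the scaled low-rank concept direction to the frozen base mapping
        delta = self.scale * torch.nn.functional.linear(x, self.B @ self.A)
        return self.base(x) + delta

if __name__ == "__main__":
    layer = SliderLinear(torch.nn.Linear(8, 8))
    layer.scale = 1.5            # turn the concept "up"; negative values turn it "down"
    print(layer(torch.randn(2, 8)).shape)
```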
+
+
+
+
+ + ♻ ☆ Breaking Modality Disparity: Harmonized Representation for Infrared and + Visible Image Registration + + +
+ Owing to differences in viewing range, resolution and relative position, the +multi-modality sensing module composed of infrared and visible cameras needs to +be registered to achieve more accurate scene perception. In practice, manual +calibration-based registration is the most widely used process, and it is +regularly calibrated to maintain accuracy, which is time-consuming and +labor-intensive. To cope with these problems, we propose a scene-adaptive +infrared and visible image registration method. Specifically, to address the +discrepancy between multi-modality images, an invertible translation process is +developed to establish a modality-invariant domain, which comprehensively +embraces the feature intensity and distribution of both infrared and visible +modalities. We employ homography to simulate the deformation between different +planes and develop a hierarchical framework to rectify the deformation inferred +from the proposed latent representation in a coarse-to-fine manner. To this end, +the advanced perception ability coupled with residual estimation is conducive +to the regression of sparse offsets, and the alternating correlation search +facilitates more accurate correspondence matching. Moreover, we propose the +first misaligned infrared and visible image dataset with available ground truth, +involving three synthetic sets and one real-world set. Extensive experiments +validate the effectiveness of the proposed method against the +state-of-the-arts, advancing the subsequent applications. + 
+
+ comment: 10 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Discrete approximations of Gaussian smoothing and Gaussian derivatives + + +
+ This paper develops an in-depth treatment concerning the problem of +approximating the Gaussian smoothing and Gaussian derivative computations in +scale-space theory for application on discrete data. With close connections to +previous axiomatic treatments of continuous and discrete scale-space theory, we +consider three main ways discretizing these scale-space operations in terms of +explicit discrete convolutions, based on either (i) sampling the Gaussian +kernels and the Gaussian derivative kernels, (ii) locally integrating the +Gaussian kernels and the Gaussian derivative kernels over each pixel support +region and (iii) basing the scale-space analysis on the discrete analogue of +the Gaussian kernel, and then computing derivative approximations by applying +small-support central difference operators to the spatially smoothed image +data. + We study the properties of these three main discretization methods both +theoretically and experimentally, and characterize their performance by +quantitative measures, including the results they give rise to with respect to +the task of scale selection, investigated for four different use cases, and +with emphasis on the behaviour at fine scales. The results show that the +sampled Gaussian kernels and derivatives as well as the integrated Gaussian +kernels and derivatives perform very poorly at very fine scales. At very fine +scales, the discrete analogue of the Gaussian kernel with its corresponding +discrete derivative approximations performs substantially better. The sampled +Gaussian kernel and the sampled Gaussian derivatives do, on the other hand, +lead to numerically very good approximations of the corresponding continuous +results, when the scale parameter is sufficiently large, in the experiments +presented in the paper, when the scale parameter is greater than a value of +about 1, in units of the grid spacing. + +
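A sketch of two of the kernel families compared in this abstract: (i) directly sampling the continuous Gaussian, and (iii) the discrete analogue of the Gaussian, whose coefficients are exponentially scaled modified Bessel functions e^{-t} I_n(t) with t = sigma^2, followed by small-support central differences for derivative approximations. The truncation radius and the toy step-edge example are assumptions for illustration.

```python
import numpy as np
from scipy.special import ive          # exponentially scaled modified Bessel I_n

def sampled_gaussian(sigma, radius):
    n = np.arange(-radius, radius + 1)
    return np.exp(-n**2 / (2.0 * sigma**2)) / (np.sqrt(2.0 * np.pi) * sigma)

def discrete_gaussian(sigma, radius):
    t = sigma**2
    n = np.arange(-radius, radius + 1)
    return ive(n, t)                   # T(n, t) = e^{-t} I_n(t)

def central_diff(signal):
    """First-derivative approximation (f[n+1] - f[n-1]) / 2 after smoothing."""
    return np.convolve(signal, [0.5, 0.0, -0.5], mode="same")

if __name__ == "__main__":
    sigma, radius = 1.0, 6
    print(sampled_gaussian(sigma, radius).sum())   # close to, but not exactly, 1
    print(discrete_gaussian(sigma, radius).sum())  # approaches 1 as the radius grows
    step = (np.arange(32) >= 16).astype(float)
    smoothed = np.convolve(step, discrete_gaussian(sigma, radius), mode="same")
    print(central_diff(smoothed).argmax())         # derivative peaks at the edge
```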
+
+ comment: 38 pages, 34 figures +
+
+
+
+
+ + ♻ ☆ Scale-Adaptive Feature Aggregation for Efficient Space-Time Video + Super-Resolution WACV2024 + + +
+ The Space-Time Video Super-Resolution (STVSR) task aims to enhance the visual +quality of videos, by simultaneously performing video frame interpolation (VFI) +and video super-resolution (VSR). However, facing the challenge of the +additional temporal dimension and scale inconsistency, most existing STVSR +methods are complex and inflexible in dynamically modeling different motion +amplitudes. In this work, we find that choosing an appropriate processing scale +achieves remarkable benefits in flow-based feature propagation. We propose a +novel Scale-Adaptive Feature Aggregation (SAFA) network that adaptively selects +sub-networks with different processing scales for individual samples. +Experiments on four public STVSR benchmarks demonstrate that SAFA achieves +state-of-the-art performance. Our SAFA network outperforms recent +state-of-the-art methods such as TMNet and VideoINR by an average improvement +of over 0.5dB on PSNR, while requiring less than half the number of parameters +and only 1/3 computational costs. + +
+
+ comment: WACV2024, 16 pages +
+
+
+
+
+ + ♻ ☆ Point, Segment and Count: A Generalized Framework for Object Counting + + +
+ Class-agnostic object counting aims to count all objects in an image with +respect to example boxes or class names, \emph{a.k.a} few-shot and zero-shot +counting. Current state-of-the-art methods highly rely on density maps to +predict object counts, which lacks model interpretability. In this paper, we +propose a generalized framework for both few-shot and zero-shot object counting +based on detection. Our framework combines the advantages of two +foundation models without compromising their zero-shot capability: (\textbf{i}) +SAM to segment all possible objects as mask proposals, and (\textbf{ii}) CLIP +to classify proposals to obtain accurate object counts. However, this strategy +faces the obstacles of efficiency overhead and small, crowded objects that +cannot be localized and distinguished. To address these issues, our framework, +termed PseCo, follows three steps: point, segment, and count. Specifically, we +first propose a class-agnostic object localization to provide accurate but +minimal point prompts for SAM, which consequently not only reduces computation +costs but also avoids missing small objects. Furthermore, we propose a +generalized object classification that leverages CLIP image/text embeddings as +the classifier, following a hierarchical knowledge distillation to obtain +discriminative classifications among hierarchical mask proposals. Extensive +experimental results on the FSC-147 dataset demonstrate that PseCo achieves +state-of-the-art performance in both few-shot/zero-shot object +counting/detection, with additional results on large-scale COCO and LVIS +datasets. The source code is available at +\url{https://github.com/Hzzone/PseCo}. + 
+
+ comment: Fix typos +
+
+
+
+
+ + ♻ ☆ DriveDreamer: Towards Real-world-driven World Models for Autonomous + Driving + + +
+ World models, especially in autonomous driving, are trending and drawing +extensive attention due to their capacity for comprehending driving +environments. The established world model holds immense potential for the +generation of high-quality driving videos, and driving policies for safe +maneuvering. However, a critical limitation in relevant research lies in its +predominant focus on gaming environments or simulated settings, thereby lacking +the representation of real-world driving scenarios. Therefore, we introduce +DriveDreamer, a pioneering world model entirely derived from real-world driving +scenarios. Regarding that modeling the world in intricate driving scenes +entails an overwhelming search space, we propose harnessing the powerful +diffusion model to construct a comprehensive representation of the complex +environment. Furthermore, we introduce a two-stage training pipeline. In the +initial phase, DriveDreamer acquires a deep understanding of structured traffic +constraints, while the subsequent stage equips it with the ability to +anticipate future states. The proposed DriveDreamer is the first world model +established from real-world driving scenarios. We instantiate DriveDreamer on +the challenging nuScenes benchmark, and extensive experiments verify that +DriveDreamer empowers precise, controllable video generation that faithfully +captures the structural constraints of real-world traffic scenarios. +Additionally, DriveDreamer enables the generation of realistic and reasonable +driving policies, opening avenues for interaction and practical applications. + +
+
+ comment: Project Page: https://drivedreamer.github.io +
+
+
+
+
+ + ♻ ☆ LanguageBind: Extending Video-Language Pretraining to N-modality by + Language-based Semantic Alignment ICLR 2024 + + +
+ Video-language (VL) pretraining has achieved remarkable improvements in +multiple downstream tasks. However, the current VL pretraining framework is +hard to extend to multiple modalities (N modalities, N>=3) beyond vision and +language. We thus propose LanguageBind, taking language as the bind across +different modalities because the language modality is well-explored and +contains rich semantics. Specifically, we freeze the language encoder acquired +by VL pretraining, then train encoders for other modalities with contrastive +learning. As a result, all modalities are mapped to a shared feature space, +implementing multi-modal semantic alignment. While LanguageBind ensures that we +can extend VL modalities to N modalities, we also need a high-quality dataset +with alignment data pairs centered on language. We thus build a dataset with +Video, Infrared, Depth, Audio and their corresponding Language, named +VIDAL-10M. In our VIDAL-10M, all videos are from short video platforms with +complete semantics rather than truncated segments from long videos, and all the +video, depth, infrared, and audio modalities are aligned to their textual +descriptions. After pretraining on VIDAL-10M, we outperform ImageBind by 5.8% +R@1 on the MSR-VTT dataset with only 15% of the parameters in the zero-shot +video-text retrieval task. Beyond this, our LanguageBind has greatly improved +on the zero-shot video, audio, depth, and infrared understanding tasks. For +instance, LanguageBind surpasses InterVideo by 1.9% on MSR-VTT, 8.8% on MSVD, +6.3% on DiDeMo, and 4.4% on ActivityNet. On the LLVIP and NYU-D datasets, +LanguageBind outperforms ImageBind by 23.8% and 11.1% top-1 accuracy. Code +address: https://github.com/PKU-YuanGroup/LanguageBind. + 
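A minimal sketch of the "bind through language" training described above: the language encoder stays frozen while a new modality encoder (e.g. depth or infrared) is trained with a symmetric InfoNCE loss so that its embeddings align with the frozen language space. The encoders here are toy linear layers and the feature dimensions are assumptions, not the LanguageBind architecture.

```python
import torch
import torch.nn.functional as F

text_encoder = torch.nn.Linear(512, 256)               # stand-in, kept frozen
for p in text_encoder.parameters():
    p.requires_grad_(False)

modality_encoder = torch.nn.Linear(1024, 256)           # stand-in, trainable
opt = torch.optim.AdamW(modality_encoder.parameters(), lr=1e-4)

def info_nce(a, b, temperature=0.07):
    a, b = F.normalize(a, dim=-1), F.normalize(b, dim=-1)
    logits = a @ b.t() / temperature
    targets = torch.arange(a.size(0))
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))

for _ in range(3):                                       # toy training loop
    text_feat = text_encoder(torch.randn(8, 512))        # paired language descriptions
    mod_feat = modality_encoder(torch.randn(8, 1024))    # paired depth/IR/audio inputs
    loss = info_nce(mod_feat, text_feat)
    opt.zero_grad(); loss.backward(); opt.step()
```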
+
+ comment: Under review as a conference paper at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ WordArt Designer: User-Driven Artistic Typography Synthesis using Large + Language Models EMNLP 2023 + + +
+ This paper introduces WordArt Designer, a user-driven framework for artistic +typography synthesis, relying on the Large Language Model (LLM). The system +incorporates four key modules: the LLM Engine, SemTypo, StyTypo, and TexTypo +modules. 1) The LLM Engine, empowered by the LLM (e.g., GPT-3.5), interprets +user inputs and generates actionable prompts for the other modules, thereby +transforming abstract concepts into tangible designs. 2) The SemTypo module +optimizes font designs using semantic concepts, striking a balance between +artistic transformation and readability. 3) Building on the semantic layout +provided by the SemTypo module, the StyTypo module creates smooth, refined +images. 4) The TexTypo module further enhances the design's aesthetics through +texture rendering, enabling the generation of inventive textured fonts. +Notably, WordArt Designer highlights the fusion of generative AI with artistic +typography. Experience its capabilities on ModelScope: +https://www.modelscope.cn/studios/WordArt/WordArt. + +
+
+ comment: Accepted by EMNLP 2023, 10 pages, 11 figures, 1 table, the system is + at https://www.modelscope.cn/studios/WordArt/WordArt +
+
+
+
+
+ + ♻ ☆ FedSoL: Bridging Global Alignment and Local Generality in Federated + Learning + + +
+ Federated Learning (FL) aggregates locally trained models from individual +clients to construct a global model. While FL enables learning a model with +data privacy, it often suffers from significant performance degradation when +client data distributions are heterogeneous. Many previous FL algorithms have +addressed this issue by introducing various proximal restrictions. These +restrictions aim to encourage global alignment by constraining the deviation of +local learning from the global objective. However, they inherently limit local +learning by interfering with the original local objectives. Recently, an +alternative approach has emerged to improve local learning generality. By +obtaining local models within a smooth loss landscape, this approach mitigates +conflicts among different local objectives of the clients. Yet, it does not +ensure stable global alignment, as local learning does not take the global +objective into account. In this study, we propose Federated Stability on +Learning (FedSoL), which combines both the concepts of global alignment and +local generality. In FedSoL, the local learning seeks a parameter region robust +against proximal perturbations. This strategy introduces an implicit proximal +restriction effect in local learning while maintaining the original local +objective for parameter update. Our experiments show that FedSoL consistently +achieves state-of-the-art performance on various setups. + +
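A rough, hedged sketch of one reading of the idea above: instead of adding a proximal term to the local loss, perturb the local weights along the proximal-gradient direction (w - w_global) and take the local-objective gradient at the perturbed point, so the local update lands in a region robust to that perturbation. The actual FedSoL procedure, hyper-parameters, and function name are in the paper; this is only a schematic single step under those assumptions.

```python
import torch

def fedsol_like_step(model, global_params, loss_fn, batch, rho=0.05, lr=0.01):
    params = [p for p in model.parameters() if p.requires_grad]
    diffs = [p.detach() - g for p, g in zip(params, global_params)]
    norm = torch.sqrt(sum((d ** 2).sum() for d in diffs)) + 1e-12

    eps = [rho * d / norm for d in diffs]        # proximal perturbation direction
    with torch.no_grad():
        for p, e in zip(params, eps):
            p.add_(e)                            # ascend along the proximal direction
    loss = loss_fn(model, batch)                 # local objective at the perturbed point
    grads = torch.autograd.grad(loss, params)
    with torch.no_grad():
        for p, e, g in zip(params, eps, grads):
            p.sub_(e)                            # undo the perturbation
            p.sub_(lr * g)                       # update original weights with perturbed gradient
    return loss.item()

if __name__ == "__main__":
    net = torch.nn.Linear(4, 2)
    global_w = [p.detach().clone() + 0.1 for p in net.parameters()]
    batch = (torch.randn(8, 4), torch.randint(0, 2, (8,)))
    lf = lambda m, b: torch.nn.functional.cross_entropy(m(b[0]), b[1])
    print(fedsol_like_step(net, global_w, lf, batch))
```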
+
+ comment: 19 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ DeepSimHO: Stable Pose Estimation for Hand-Object Interaction via + Physics Simulation NeurIPS 2023 + + +
+ This paper addresses the task of 3D pose estimation for a hand interacting +with an object from a single image observation. When modeling hand-object +interaction, previous works mainly exploit proximity cues, while overlooking +the dynamical nature that the hand must stably grasp the object to counteract +gravity and thus preventing the object from slipping or falling. These works +fail to leverage dynamical constraints in the estimation and consequently often +produce unstable results. Meanwhile, refining unstable configurations with +physics-based reasoning remains challenging, both by the complexity of contact +dynamics and by the lack of effective and efficient physics inference in the +data-driven learning framework. To address both issues, we present DeepSimHO: a +novel deep-learning pipeline that combines forward physics simulation and +backward gradient approximation with a neural network. Specifically, for an +initial hand-object pose estimated by a base network, we forward it to a +physics simulator to evaluate its stability. However, due to non-smooth contact +geometry and penetration, existing differentiable simulators can not provide +reliable state gradient. To remedy this, we further introduce a deep network to +learn the stability evaluation process from the simulator, while smoothly +approximating its gradient and thus enabling effective back-propagation. +Extensive experiments show that our method noticeably improves the stability of +the estimation and achieves superior efficiency over test-time optimization. +The code is available at https://github.com/rongakowang/DeepSimHO. + +
+
+ comment: Accepted to NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ DocPedia: Unleashing the Power of Large Multimodal Model in the + Frequency Domain for Versatile Document Understanding + + +
+ This work presents DocPedia, a novel large multimodal model (LMM) for +versatile OCR-free document understanding, capable of parsing images up to +2,560$\times$2,560 resolution. Unlike existing work, which either struggles with +high-resolution documents or gives up the large language model, thereby constraining vision or +language ability, our DocPedia directly processes visual input in +the frequency domain rather than the pixel space. This unique characteristic +enables DocPedia to capture a greater amount of visual and textual information +using a limited number of visual tokens. To consistently enhance both +perception and comprehension abilities of our model, we develop a dual-stage +training strategy and enrich instructions/annotations of all training tasks +covering multiple document types. Extensive quantitative and qualitative +experiments conducted on various publicly available benchmarks confirm the +mutual benefits of jointly learning perception and comprehension tasks. The +results provide further evidence of the effectiveness and superior performance +of our DocPedia over other methods. + 
+
+
+
+
+ + ♻ ☆ Animatable 3D Gaussians for High-fidelity Synthesis of Human Motions + + +
+ We present a novel animatable 3D Gaussian model for rendering high-fidelity +free-view human motions in real time. Compared to existing NeRF-based methods, +the model is better at synthesizing high-frequency details without +the jittering problem across video frames. The core of our model is a novel +augmented 3D Gaussian representation, which attaches a +learnable code to each Gaussian. The learnable code serves as a pose-dependent appearance +embedding for refining the erroneous appearance caused by geometric +transformation of Gaussians, based on which an appearance refinement model is +learned to produce residual Gaussian properties to match the appearance in the +target pose. To force the Gaussians to learn the foreground human only without +background interference, we further design a novel alpha loss to explicitly +constrain the Gaussians within the human body. We also propose to jointly +optimize the human joint parameters to improve the appearance accuracy. The +animatable 3D Gaussian model can be learned with shallow MLPs, so new human +motions can be synthesized in real time (66 fps on average). Experiments show +that our model has superior performance over NeRF-based methods. + 
+
+ comment: Some experiment data is wrong. The expression of the paper in + introduction and abstract is incorrect. Some graphs have inappropriate + descriptions +
+
+
+
+
+
+
+
+ + Information Retrieval 16 + +
+
+
+ + ☆ BioLORD-2023: Semantic Textual Representations Fusing LLM and Clinical + Knowledge Graph Insights + + +
+ In this study, we investigate the potential of Large Language Models to +complement biomedical knowledge graphs in the training of semantic models for +the biomedical and clinical domains. Drawing on the wealth of the UMLS +knowledge graph and harnessing cutting-edge Large Language Models, we propose a +new state-of-the-art approach for obtaining high-fidelity representations of +biomedical concepts and sentences, consisting of three steps: an improved +contrastive learning phase, a novel self-distillation phase, and a weight +averaging phase. Through rigorous evaluations via the extensive BioLORD testing +suite and diverse downstream tasks, we demonstrate consistent and substantial +performance improvements over the previous state of the art (e.g. +2pts on +MedSTS, +2.5pts on MedNLI-S, +6.1pts on EHR-Rel-B). Besides our new +state-of-the-art biomedical model for English, we also distill and release a +multilingual model compatible with 50+ languages and finetuned on 7 European +languages. Many clinical pipelines can benefit from our latest models. Our new +multilingual model enables a range of languages to benefit from our +advancements in biomedical semantic representation learning, opening a new +avenue for bioinformatics researchers around the world. As a result, we hope to +see BioLORD-2023 becoming a precious tool for future biomedical applications. + +
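The third phase mentioned above, weight averaging, is conceptually simple; a minimal sketch is to uniformly average the parameter tensors of several fine-tuned checkpoints of the same architecture. Which checkpoints are averaged, and the contrastive and self-distillation phases, follow the paper rather than this snippet.

```python
import torch

def average_checkpoints(state_dicts):
    """Uniformly average a list of state_dicts from identical architectures."""
    avg = {}
    for key in state_dicts[0]:
        avg[key] = torch.stack([sd[key].float() for sd in state_dicts]).mean(dim=0)
    return avg

if __name__ == "__main__":
    nets = [torch.nn.Linear(4, 4) for _ in range(3)]
    merged = torch.nn.Linear(4, 4)
    merged.load_state_dict(average_checkpoints([n.state_dict() for n in nets]))
```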
+
+ comment: Preprint of upcoming journal article +
+
+
+
+
+ + ☆ SEINE: SEgment-based Indexing for NEural information retrieval + + +
+ Many early neural Information Retrieval (NeurIR) methods are re-rankers that +rely on a traditional first-stage retriever due to expensive query time +computations. Recently, representation-based retrievers have gained much +attention; they learn query and document representations +separately, making it possible to pre-compute document representations offline +and reduce the workload at query time. Both dense and sparse +representation-based retrievers have been explored. However, these methods +focus on finding the representation that best represents a text (aka metric +learning), and the actual retrieval function responsible for similarity +matching between query and document is kept to a minimum by using the dot product. +One drawback is that, unlike a traditional term-level inverted index, the index +formed by these embeddings cannot be easily re-used by another retrieval +method. Another drawback is that keeping the interaction to a minimum hurts +retrieval effectiveness. On the contrary, interaction-based retrievers are +known for their better retrieval effectiveness. In this paper, we propose a +novel SEgment-based Neural Indexing method, SEINE, which provides a general +indexing framework that can flexibly support a variety of interaction-based +neural retrieval methods. We emphasize a careful decomposition of common +components in existing neural retrieval methods and propose to use a +segment-level inverted index to store the atomic query-document interaction +values. Experiments on the LETOR MQ2007 and MQ2008 datasets show that our indexing +method can accelerate multiple neural retrieval methods by up to 28 times +without sacrificing much effectiveness. + 
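A toy sketch in the spirit of the segment-level inverted index described above: each document is split into fixed-length segments, and the index maps a term to per-(document, segment) values that a retrieval function later aggregates. SEINE stores learned interaction values rather than the raw term counts used here; the segment length and scoring rule are assumptions for illustration.

```python
from collections import defaultdict

def build_segment_index(docs, segment_len=16):
    index = defaultdict(list)                 # term -> [(doc_id, seg_id, value)]
    for doc_id, text in docs.items():
        tokens = text.lower().split()
        for start in range(0, len(tokens), segment_len):
            segment = tokens[start:start + segment_len]
            for term in set(segment):
                index[term].append((doc_id, start // segment_len, segment.count(term)))
    return index

def score(index, query):
    scores = defaultdict(float)
    for term in query.lower().split():
        for doc_id, _seg, value in index.get(term, []):
            scores[doc_id] += value           # simple sum over matching segments
    return sorted(scores.items(), key=lambda kv: -kv[1])

if __name__ == "__main__":
    docs = {"d1": "neural retrieval with segment level inverted index",
            "d2": "traditional term level inverted index for retrieval"}
    idx = build_segment_index(docs, segment_len=4)
    print(score(idx, "inverted index retrieval"))
```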
+
+
+
+
+ + ☆ A Social-aware Gaussian Pre-trained Model for Effective Cold-start + Recommendation + + +
+ The use of pre-training is an emerging technique to enhance a neural model's +performance, which has been shown to be effective for many neural language +models such as BERT. This technique has also been used to enhance the +performance of recommender systems. In such recommender systems, pre-training +models are used to learn a better initialisation for both users and items. +However, existing pre-trained recommender systems tend to only +incorporate the user interaction data at the pre-training stage, making it +difficult to deliver good recommendations, especially when the interaction data +is sparse. To alleviate this common data sparsity issue, we propose to +pre-train the recommendation model not only with the interaction data but also +with other available information such as the social relations among users, +thereby providing the recommender system with a better initialisation compared +with solely relying on the user interaction data. We propose a novel +recommendation model, the Social-aware Gaussian Pre-trained model (SGP), which +encodes the user social relations and interaction data at the pre-training +stage in a Graph Neural Network (GNN). Afterwards, in the subsequent +fine-tuning stage, our SGP model adopts a Gaussian Mixture Model (GMM) to +factorise these pre-trained embeddings for further training, thereby benefiting +the cold-start users from these pre-built social relations. Our extensive +experiments on three public datasets show that, in comparison to 16 competitive +baselines, our SGP model significantly outperforms the best baseline by up to +7.7% in terms of NDCG@10. In addition, we show that SGP effectively +alleviates the cold-start problem, especially when users newly register to the +system through their friends' suggestions. + 
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ Justifiable Artificial Intelligence: Engineering Large Language Models + for Legal Applications + + +
+ In this work, I discuss how Large Language Models can be applied in the legal +domain, circumventing their current drawbacks. Despite their large success and +acceptance, their lack of explainability hinders legal experts to trust in +their output, and this happens rightfully so. However, in this paper, I argue +in favor of a new view, Justifiable Artificial Intelligence, instead of +focusing on Explainable Artificial Intelligence. I discuss in this paper how +gaining evidence for and against a Large Language Model's output may make their +generated texts more trustworthy - or hold them accountable for misinformation. + +
+
+ comment: 12 pages, 2 figures +
+
+
+
+
+ + ☆ Two Approaches to the Identity of Processes in BFO + + +
+ This paper aims to explore processes and their identity with a focus on the +upper ontology Basic Formal Ontology (BFO). We begin with a classification +based on two basic classes of changes of independent continuants: changes with +respect to a single specifically dependent continuant thereof or with respect +to the spatial region that its parts occupy. We accordingly distinguish two +kinds of simple processes: specifically dependent continuant changes and +spatial changes. Next, we investigate a compositional approach to the identity +of processes: the identity of any process is determined by the identity of the +simple processes that compose them. Then, we consider a causal approach to the +identity of processes with recourse to a dispositional view of processes +according to which any process is a realization of some disposition. We also +examine assumptions on which these two approaches to the identity of processes +are based. + +
+
+
+
+
+ + ☆ Experimental Analysis of Large-scale Learnable Vector Storage + Compression + + +
+ Learnable embedding vectors are among the most important components in
+machine learning, and are widely used in various database-related domains.
+However, the high dimensionality of sparse data in recommendation tasks and
+the huge corpus volume in retrieval-related tasks lead to a large memory
+consumption of the embedding table, which poses a great challenge to the
+training and deployment of models. Recent research has proposed various
+methods to compress the embeddings at the cost of a slight decrease in model
+quality or the introduction of other overheads. Nevertheless, the relative
+performance of these methods remains unclear. Existing experimental
+comparisons only cover a subset of these methods and focus on limited metrics.
+In this paper, we perform a comprehensive comparative analysis and
+experimental evaluation of embedding compression. We introduce a new taxonomy
+that categorizes these techniques based on their characteristics and
+methodologies, and further develop a modular benchmarking framework that
+integrates 14 representative methods. Under a uniform test environment, our
+benchmark fairly evaluates each approach, presents their strengths and
+weaknesses under different memory budgets, and recommends the best method
+based on the use case. In addition to providing useful guidelines, our study
+also uncovers the limitations of current methods and suggests potential
+directions for future research.
+
+&#13;
+
+
+
+
+ + ☆ Boot and Switch: Alternating Distillation for Zero-Shot Dense Retrieval EMNLP 2023 + + +
+ Neural 'dense' retrieval models are state-of-the-art for many datasets;
+however, these models often exhibit limited domain transfer ability. Existing
+approaches to adaptation are unwieldy, such as requiring explicit supervision,
+complex model architectures, or massive external models. We present
+$\texttt{ABEL}$, a simple but effective unsupervised method to enhance passage
+retrieval in zero-shot settings. Our technique follows a straightforward loop:
+a dense retriever learns from supervision signals provided by a reranker, and
+subsequently, the reranker is updated based on feedback from the improved
+retriever. By iterating this loop, the two components mutually enhance one
+another's performance. Experimental results demonstrate that our unsupervised
+$\texttt{ABEL}$ model outperforms both leading supervised and unsupervised
+retrievers on the BEIR benchmark. Meanwhile, it exhibits strong adaptation
+abilities to tasks and domains that were unseen during training. By either
+fine-tuning $\texttt{ABEL}$ on labelled data or integrating it with existing
+supervised dense retrievers, we achieve state-of-the-art
+results.\footnote{Source code is available at
+\url{https://github.com/Fantabulous-J/BootSwitch}.}
+
+&#13;
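+
+ The alternating loop described above can be paraphrased in a few lines of
+Python (an illustrative sketch with hypothetical Retriever and Reranker
+placeholders, not the released ABEL implementation):
+
+    def alternating_distillation(retriever, reranker, corpus, queries, rounds=3):
+        for _ in range(rounds):
+            # 1) Retrieve candidates, score them with the reranker, and distil
+            #    the reranker's soft labels into the retriever.
+            candidates = {q: retriever.search(q, corpus, k=100) for q in queries}
+            soft_labels = {q: reranker.score(q, candidates[q]) for q in queries}
+            retriever.train_on(candidates, soft_labels)
+
+            # 2) Refresh candidates with the improved retriever and update the
+            #    reranker on this harder feedback.
+            candidates = {q: retriever.search(q, corpus, k=100) for q in queries}
+            reranker.train_on(candidates)
+        return retriever, reranker
+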
+
+ comment: Accepted by EMNLP 2023 Findings +
+
+
+
+
+ + ☆ Noisy Self-Training with Synthetic Queries for Dense Retrieval EMNLP 2023 + + +
+ Although existing neural retrieval models reveal promising results when +training data is abundant and the performance keeps improving as training data +increases, collecting high-quality annotated data is prohibitively costly. To +this end, we introduce a novel noisy self-training framework combined with +synthetic queries, showing that neural retrievers can be improved in a +self-evolution manner with no reliance on any external models. Experimental +results show that our method improves consistently over existing methods on +both general-domain (e.g., MS-MARCO) and out-of-domain (i.e., BEIR) retrieval +benchmarks. Extra analysis on low-resource settings reveals that our method is +data efficient and outperforms competitive baselines, with as little as 30% of +labelled training data. Further extending the framework for reranker training +demonstrates that the proposed method is general and yields additional gains on +tasks of diverse domains.\footnote{Source code is available at +\url{https://github.com/Fantabulous-J/Self-Training-DPR}} + +
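+
+ One plausible shape for a single self-training round as described above (a
+hedged sketch with placeholder components, not the paper's code): a query
+generator writes synthetic queries for unlabelled passages, the current
+retriever pseudo-labels them, and the student is trained on the pseudo-labels
+with simple input noise such as token dropout.
+
+    import random
+
+    def self_training_round(retriever, query_generator, passages, noise_p=0.1):
+        synthetic = [(query_generator.generate(p), p) for p in passages]
+        pseudo = [(q, retriever.rank(q, passages)[:1]) for q, _ in synthetic]
+
+        def add_noise(text):  # crude token dropout as the "noise" ingredient
+            tokens = text.split()
+            return " ".join(t for t in tokens if random.random() > noise_p)
+
+        noisy_batch = [(add_noise(q), positives) for q, positives in pseudo]
+        retriever.train_on(noisy_batch)
+        return retriever
+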
+
+ comment: Accepted by EMNLP 2023 Findings +
+
+
+
+
+ + ☆ UFIN: Universal Feature Interaction Network for Multi-Domain + Click-Through Rate Prediction + + +
+ Click-Through Rate (CTR) prediction, which aims to estimate the probability
+of a user clicking on an item, is a key task in online advertising. Numerous
+existing CTR models concentrate on modeling the feature interactions within a
+solitary domain, thereby rendering them inadequate for fulfilling the
+requisites of multi-domain recommendations in real industrial scenarios. Some
+recent approaches propose intricate architectures to enhance knowledge sharing
+and augment model training across multiple domains. However, these approaches
+encounter difficulties when transferred to new recommendation domains, owing
+to their reliance on the modeling of ID features (e.g., item id). To address
+the above issue, we propose the Universal Feature Interaction Network (UFIN)
+approach for CTR prediction. UFIN exploits textual data to learn universal
+feature interactions that can be effectively transferred across diverse
+domains. For learning universal feature representations, we regard text and
+features as two different modalities and propose an encoder-decoder network
+founded on a Large Language Model (LLM) to enforce the transfer of data from
+the text modality to the feature modality. Building upon the above foundation,
+we further develop a mixture-of-experts (MoE) enhanced adaptive feature
+interaction model to learn transferable collaborative patterns across multiple
+domains. Furthermore, we propose a multi-domain knowledge distillation
+framework to enhance feature interaction learning. Based on the above methods,
+UFIN can effectively bridge the semantic gap to learn common knowledge across
+various domains, surpassing the constraints of ID-based models. Extensive
+experiments conducted on eight datasets show the effectiveness of UFIN, in
+both multi-domain and cross-platform settings. Our code is available at
+https://github.com/RUCAIBox/UFIN.
+
+&#13;
+
+
+
+
+ + ☆ Robust Basket Recommendation via Noise-tolerated Graph Contrastive + Learning CIKM 2023 + + +
+ The growth of e-commerce has seen a surge in popularity of platforms like
+Amazon, eBay, and Taobao. This has given rise to a unique shopping behavior
+involving baskets - sets of items purchased together. As a less studied
+interaction mode in the community, the question of how shopping baskets should
+complement personalized recommendation systems remains under-explored. While
+previous attempts focused on jointly modeling user purchases and baskets, the
+distinct semantic nature of these elements can introduce noise when directly
+integrated. This noise negatively impacts the model's performance, further
+exacerbated by significant noise within both user and basket behaviors.
+ In order to cope with the above difficulties, we propose a novel Basket
+recommendation framework via Noise-tolerated Contrastive Learning, named BNCL,
+to handle the noise in cross-behavior integration and within-behavior
+modeling. First, we represent the basket-item interactions as a hypergraph to
+model the complex basket behavior, where all items appearing in the same
+basket are treated as a single hyperedge. Second, cross-behavior contrastive
+learning is designed to suppress the noise during the fusion of diverse
+behaviors. Next, to further inhibit the within-behavior noise of the user and
+basket interactions, we propose to exploit invariant properties of the
+recommenders w.r.t. augmentations through within-behavior contrastive
+learning. A novel consistency-aware augmentation approach is further designed
+to better identify noisy interactions with the consideration of the above two
+types of interactions. Our framework BNCL offers a generic training paradigm
+that is applicable to different backbones. Extensive experiments on three
+shopping transaction datasets verify the effectiveness of our proposed method.
+Our code is available.
+
+&#13;
+
+ comment: CIKM 2023 +
+
+
+
+
+ + ☆ The Graph Convolutional Network with Multi-representation Alignment for + Drug Synergy Prediction + + +
+ Drug combination refers to the use of two or more drugs to treat a specific
+disease at the same time. It is currently the mainstream way to treat complex
+diseases. Compared with single drugs, drug combinations have better efficacy
+and can better inhibit toxicity and drug resistance. Computational models
+based on deep learning concatenate the representations of multiple drugs and
+the corresponding cell line features as input, and output whether the drug
+combination has an inhibitory effect on the cell line. However, this strategy
+of concatenating multiple representations has the following defects: the
+alignment of drug representations and cell line representations is ignored,
+so the synergistic relationship is not reflected positionally in the embedding
+space. Moreover, common alignment measurement functions in deep learning are
+not suitable for drug synergy prediction due to differences in input types.
+Therefore, in this work, we propose a graph convolutional network with
+multi-representation alignment (GCNMRA) for predicting drug synergy. In the
+GCNMRA model, we design a multi-representation alignment function suitable for
+the drug synergy prediction task so that the positional relationship between
+drug representations and cell line representations is reflected in the
+embedding space. In addition, the vector moduli of drug and cell line
+representations are considered to improve the accuracy of the calculated
+results and accelerate model convergence. Finally, extensive experiments on
+multiple drug synergy datasets verify the effectiveness of the above
+innovations and the strength of the GCNMRA model.
+
+&#13;
+
+ comment: 14 pages; +
+
+
+
+
+ + ♻ ☆ Thoroughly Modeling Multi-domain Pre-trained Recommendation as Language + + +
+ With pre-trained language models (PLMs) widely verified across various NLP
+tasks, pioneering efforts have attempted to combine the general textual
+information in PLMs with the personalized behavioral information in user
+historical behavior sequences to enhance sequential recommendation (SR).
+However, despite the commonalities of input format and task goal, there are
+huge gaps between the behavioral and textual information, which obstruct
+thoroughly modeling SR as language modeling via PLMs. To bridge the gap, we
+propose a novel Unified pre-trained language model enhanced sequential
+recommendation (UPSR), aiming to build a unified pre-trained recommendation
+model for multi-domain recommendation tasks. We formally design five key
+indicators, namely naturalness, domain consistency, informativeness, noise &
+ambiguity, and text length, to guide the text-item adaptation and behavior
+sequence-text sequence adaptation differently for the pre-training and
+fine-tuning stages, aspects that are essential but under-explored in previous
+works. In experiments, we conduct extensive evaluations on seven datasets in
+both tuning and zero-shot settings and achieve the best overall performance.
+Comprehensive model analyses also provide valuable insights for behavior
+modeling via PLMs, shedding light on large pre-trained recommendation models.
+The source code will be released in the future.
+
+&#13;
+
+
+
+
+ + ♻ ☆ AI-Generated Images Introduce Invisible Relevance Bias to Text-Image + Retrieval + + +
+ With the advancement of generation models, AI-generated content (AIGC) is +becoming more realistic, flooding the Internet. A recent study suggests that +this phenomenon has elevated the issue of source bias in text retrieval for web +searches. Specifically, neural retrieval models tend to rank generated texts +higher than human-written texts. In this paper, we extend the study of this +bias to cross-modal retrieval. Firstly, we successfully construct a suitable +benchmark to explore the existence of the bias. Subsequent extensive +experiments on this benchmark reveal that AI-generated images introduce an +invisible relevance bias to text-image retrieval models. Specifically, our +experiments show that text-image retrieval models tend to rank the AI-generated +images higher than the real images, even though the AI-generated images do not +exhibit more visually relevant features to the query than real images. This +invisible relevance bias is prevalent across retrieval models with varying +training data and architectures. Furthermore, our subsequent exploration +reveals that the inclusion of AI-generated images in the training data of the +retrieval models exacerbates the invisible relevance bias. The above phenomenon +triggers a vicious cycle, which makes the invisible relevance bias become more +and more serious. To elucidate the potential causes of invisible relevance and +address the aforementioned issues, we introduce an effective training method +aimed at alleviating the invisible relevance bias. Subsequently, we apply our +proposed debiasing method to retroactively identify the causes of invisible +relevance, revealing that the AI-generated images induce the image encoder to +embed additional information into their representation. This information +exhibits a certain consistency across generated images with different semantics +and can make the retriever estimate a higher relevance score. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ♻ ☆ Prompt Tuning on Graph-augmented Low-resource Text Classification + + +
+ Text classification is a fundamental problem in information retrieval with +many real-world applications, such as predicting the topics of online articles +and the categories of e-commerce product descriptions. However, low-resource +text classification, with no or few labeled samples, presents a serious concern +for supervised learning. Meanwhile, many text data are inherently grounded on a +network structure, such as a hyperlink/citation network for online articles, +and a user-item purchase network for e-commerce products. These graph +structures capture rich semantic relationships, which can potentially augment +low-resource text classification. In this paper, we propose a novel model +called Graph-Grounded Pre-training and Prompting (G2P2) to address low-resource +text classification in a two-pronged approach. During pre-training, we propose +three graph interaction-based contrastive strategies to jointly pre-train a +graph-text model; during downstream classification, we explore handcrafted +discrete prompts and continuous prompt tuning for the jointly pre-trained model +to achieve zero- and few-shot classification, respectively. Moreover, we +explore the possibility of employing continuous prompt tuning for zero-shot +inference. Specifically, we aim to generalize continuous prompts to unseen +classes while leveraging a set of base classes. To this end, we extend G2P2 +into G2P2$^*$, hinging on a new architecture of conditional prompt tuning. +Extensive experiments on four real-world datasets demonstrate the strength of +G2P2 in zero- and few-shot low-resource text classification tasks, and +illustrate the advantage of G2P2$^*$ in dealing with unseen classes. + +
+
+ comment: 14 pages, journal under review. arXiv admin note: substantial text + overlap with arXiv:2305.03324 +
+
+
+
+
+ + ♻ ☆ LM-Cocktail: Resilient Tuning of Language Models via Model Merging + + +
+ Pre-trained language models are continually fine-tuned to better support
+downstream applications. However, this operation may result in significant
+performance degeneration on general tasks beyond the targeted domain. To
+overcome this problem, we propose a novel method which enables the fine-tuned
+model to stay resilient on general tasks. Our method is conducted in the form
+of model merging (namely LM-Cocktail), where the fine-tuned language model is
+merged with the pre-trained base model or the peer models from other domains
+through a weighted average. Despite its simplicity, LM-Cocktail is
+surprisingly effective: the resulting model achieves strong empirical
+performance across the whole scope of general tasks while preserving a
+superior capacity in its targeted domain. We conduct comprehensive experiments
+with the Llama and BGE models on popular benchmarks, including FLAN, MMLU, and
+MTEB, whose results validate the efficacy of our proposed method. The code and
+checkpoints are available at
+https://github.com/FlagOpen/FlagEmbedding/tree/master/LM_Cocktail.
+
+&#13;
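+
+ The merging step above boils down to a weighted average of same-architecture
+parameter dictionaries. A minimal sketch (the generic recipe, not the official
+LM-Cocktail code):
+
+    def merge_state_dicts(state_dicts, weights):
+        """Weighted average of PyTorch state dicts with identical keys/shapes."""
+        merged = {}
+        for key in state_dicts[0]:
+            merged[key] = sum(w * sd[key] for w, sd in zip(weights, state_dicts))
+        return merged
+
+    # e.g. keep 70% of the fine-tuned model and mix in 30% of the base model:
+    # model.load_state_dict(merge_state_dicts(
+    #     [finetuned.state_dict(), base.state_dict()], [0.7, 0.3]))
+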
+
+
+
+
+ + ♻ ☆ DSI++: Updating Transformer Memory with New Documents EMNLP 2023 + + +
+ Differentiable Search Indices (DSIs) encode a corpus of documents in model +parameters and use the same model to answer user queries directly. Despite the +strong performance of DSI models, deploying them in situations where the corpus +changes over time is computationally expensive because reindexing the corpus +requires re-training the model. In this work, we introduce DSI++, a continual +learning challenge for DSI to incrementally index new documents while being +able to answer queries related to both previously and newly indexed documents. +Across different model scales and document identifier representations, we show +that continual indexing of new documents leads to considerable forgetting of +previously indexed documents. We also hypothesize and verify that the model +experiences forgetting events during training, leading to unstable learning. To +mitigate these issues, we investigate two approaches. The first focuses on +modifying the training dynamics. Flatter minima implicitly alleviate +forgetting, so we optimize for flatter loss basins and show that the model +stably memorizes more documents ($+12\%$). Next, we introduce a generative +memory to sample pseudo-queries for documents and supplement them during +continual indexing to prevent forgetting for the retrieval task. Extensive +experiments on novel continual indexing benchmarks based on Natural Questions +(NQ) and MS MARCO demonstrate that our proposed solution mitigates forgetting +significantly. Concretely, it improves the average Hits@10 by $+21.1\%$ over +competitive baselines for NQ and requires $6$ times fewer model updates +compared to re-training the DSI model for incrementally indexing five corpora +in a sequence. + +
+
+ comment: Accepted at EMNLP 2023 main conference +
+
+
+
+
+
+
+
+ + Machine Learning 156 + +
+
+
+ + ☆ Test-time Adaptation of Discriminative Models via Diffusion Generative + Feedback NeurIPS 2023 + + +
+ The advancements in generative modeling, particularly the advent of diffusion +models, have sparked a fundamental question: how can these models be +effectively used for discriminative tasks? In this work, we find that +generative models can be great test-time adapters for discriminative models. +Our method, Diffusion-TTA, adapts pre-trained discriminative models such as +image classifiers, segmenters and depth predictors, to each unlabelled example +in the test set using generative feedback from a diffusion model. We achieve +this by modulating the conditioning of the diffusion model using the output of +the discriminative model. We then maximize the image likelihood objective by +backpropagating the gradients to discriminative model's parameters. We show +Diffusion-TTA significantly enhances the accuracy of various large-scale +pre-trained discriminative models, such as, ImageNet classifiers, CLIP models, +image pixel labellers and image depth predictors. Diffusion-TTA outperforms +existing test-time adaptation methods, including TTT-MAE and TENT, and +particularly shines in online adaptation setups, where the discriminative model +is continually adapted to each example in the test set. We provide access to +code, results, and visualizations on our website: +https://diffusion-tta.github.io/. + +
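+
+ A hedged sketch of the adaptation loop described above (the diffusion and
+classifier interfaces here are hypothetical placeholders, not the released
+Diffusion-TTA code): the classifier's predicted class probabilities condition
+the diffusion model, and the denoising loss is backpropagated into the
+classifier for each unlabelled test image.
+
+    import torch
+
+    def diffusion_tta_step(classifier, diffusion, image, optimizer, n_steps=5):
+        for _ in range(n_steps):
+            probs = classifier(image).softmax(dim=-1)      # conditioning signal
+            t = torch.randint(0, diffusion.num_timesteps, (image.shape[0],))
+            noise = torch.randn_like(image)
+            noisy = diffusion.q_sample(image, t, noise)     # forward diffusion
+            pred_noise = diffusion(noisy, t, cond=probs)
+            loss = torch.nn.functional.mse_loss(pred_noise, noise)
+            optimizer.zero_grad()
+            loss.backward()    # gradients flow through probs into the classifier
+            optimizer.step()   # optimizer holds only the classifier's parameters
+        return classifier(image).argmax(dim=-1)
+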
+
+ comment: Accepted at NeurIPS 2023 Webpage with Code: + https://diffusion-tta.github.io/ +
+
+
+
+
+ + ☆ How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for + Vision LLMs SC + + +
+ This work focuses on the potential of Vision LLMs (VLLMs) in visual +reasoning. Different from prior studies, we shift our focus from evaluating +standard performance to introducing a comprehensive safety evaluation suite, +covering both out-of-distribution (OOD) generalization and adversarial +robustness. For the OOD evaluation, we present two novel VQA datasets, each +with one variant, designed to test model performance under challenging +conditions. In exploring adversarial robustness, we propose a straightforward +attack strategy for misleading VLLMs to produce visual-unrelated responses. +Moreover, we assess the efficacy of two jailbreaking strategies, targeting +either the vision or language component of VLLMs. Our evaluation of 21 diverse +models, ranging from open-source VLLMs to GPT-4V, yields interesting +observations: 1) Current VLLMs struggle with OOD texts but not images, unless +the visual information is limited; and 2) These VLLMs can be easily misled by +deceiving vision encoders only, and their vision-language training often +compromise safety protocols. We release this safety evaluation suite at +https://github.com/UCSC-VLAA/vllm-safety-benchmark. + +
+
+ comment: H.T., C.C., and Z.W. contribute equally. Work done during H.T. and + Z.W.'s internship at UCSC, and C.C. and Y.Z.'s internship at UNC +
+
+
+
+
+ + ☆ On Bringing Robots Home + + +
+ Throughout history, we have successfully integrated various machines into our +homes. Dishwashers, laundry machines, stand mixers, and robot vacuums are a few +recent examples. However, these machines excel at performing only a single task +effectively. The concept of a "generalist machine" in homes - a domestic +assistant that can adapt and learn from our needs, all while remaining +cost-effective - has long been a goal in robotics that has been steadily +pursued for decades. In this work, we initiate a large-scale effort towards +this goal by introducing Dobb-E, an affordable yet versatile general-purpose +system for learning robotic manipulation within household settings. Dobb-E can +learn a new task with only five minutes of a user showing it how to do it, +thanks to a demonstration collection tool ("The Stick") we built out of cheap +parts and iPhones. We use the Stick to collect 13 hours of data in 22 homes of +New York City, and train Home Pretrained Representations (HPR). Then, in a +novel home environment, with five minutes of demonstrations and fifteen minutes +of adapting the HPR model, we show that Dobb-E can reliably solve the task on +the Stretch, a mobile robot readily available on the market. Across roughly 30 +days of experimentation in homes of New York City and surrounding areas, we +test our system in 10 homes, with a total of 109 tasks in different +environments, and finally achieve a success rate of 81%. Beyond success +percentages, our experiments reveal a plethora of unique challenges absent or +ignored in lab robotics. These range from effects of strong shadows, to +variable demonstration quality by non-expert users. With the hope of +accelerating research on home robots, and eventually seeing robot butlers in +every home, we open-source Dobb-E software stack and models, our data, and our +hardware designs at https://dobb-e.com + +
+
+ comment: Project website and videos are available at https://dobb-e.com, + technical documentation for getting started is available at + https://docs.dobb-e.com, and code is released at + https://github.com/notmahi/dobb-e +
+
+
+
+
+ + ☆ Have we built machines that think like people? + + +
+ A chief goal of artificial intelligence is to build machines that think like +people. Yet it has been argued that deep neural network architectures fail to +accomplish this. Researchers have asserted these models' limitations in the +domains of causal reasoning, intuitive physics, and intuitive psychology. Yet +recent advancements, namely the rise of large language models, particularly +those designed for visual processing, have rekindled interest in the potential +to emulate human-like cognitive abilities. This paper evaluates the current +state of vision-based large language models in the domains of intuitive +physics, causal reasoning, and intuitive psychology. Through a series of +controlled experiments, we investigate the extent to which these modern models +grasp complex physical interactions, causal relationships, and intuitive +understanding of others' preferences. Our findings reveal that, while these +models demonstrate a notable proficiency in processing and interpreting visual +data, they still fall short of human capabilities in these areas. The models +exhibit a rudimentary understanding of physical laws and causal relationships, +but their performance is hindered by a lack of deeper insights-a key aspect of +human cognition. Furthermore, in tasks requiring an intuitive theory of mind, +the models fail altogether. Our results emphasize the need for integrating more +robust mechanisms for understanding causality, physical dynamics, and social +cognition into modern-day, vision-based language models, and point out the +importance of cognitively-inspired benchmarks. + +
+
+
+
+
+ + ☆ Interactive Autonomous Navigation with Internal State Inference and + Interactivity Estimation + + +
+ Deep reinforcement learning (DRL) provides a promising way for intelligent +agents (e.g., autonomous vehicles) to learn to navigate complex scenarios. +However, DRL with neural networks as function approximators is typically +considered a black box with little explainability and often suffers from +suboptimal performance, especially for autonomous navigation in highly +interactive multi-agent environments. To address these issues, we propose three +auxiliary tasks with spatio-temporal relational reasoning and integrate them +into the standard DRL framework, which improves the decision making performance +and provides explainable intermediate indicators. We propose to explicitly +infer the internal states (i.e., traits and intentions) of surrounding agents +(e.g., human drivers) as well as to predict their future trajectories in the +situations with and without the ego agent through counterfactual reasoning. +These auxiliary tasks provide additional supervision signals to infer the +behavior patterns of other interactive agents. Multiple variants of framework +integration strategies are compared. We also employ a spatio-temporal graph +neural network to encode relations between dynamic entities, which enhances +both internal state inference and decision making of the ego agent. Moreover, +we propose an interactivity estimation mechanism based on the difference +between predicted trajectories in these two situations, which indicates the +degree of influence of the ego agent on other agents. To validate the proposed +method, we design an intersection driving simulator based on the Intelligent +Intersection Driver Model (IIDM) that simulates vehicles and pedestrians. Our +approach achieves robust and state-of-the-art performance in terms of standard +evaluation metrics and provides explainable intermediate indicators (i.e., +internal states, and interactivity scores) for decision making. + +
+
+ comment: 18 pages, 14 figures +
+
+
+
+
+ + ☆ MAST: Model-Agnostic Sparsified Training + + +
+ We introduce a novel optimization problem formulation that departs from the +conventional way of minimizing machine learning model loss as a black-box +function. Unlike traditional formulations, the proposed approach explicitly +incorporates an initially pre-trained model and random sketch operators, +allowing for sparsification of both the model and gradient during training. We +establish insightful properties of the proposed objective function and +highlight its connections to the standard formulation. Furthermore, we present +several variants of the Stochastic Gradient Descent (SGD) method adapted to the +new problem formulation, including SGD with general sampling, a distributed +version, and SGD with variance reduction techniques. We achieve tighter +convergence rates and relax assumptions, bridging the gap between theoretical +principles and practical applications, covering several important techniques +such as Dropout and Sparse training. This work presents promising opportunities +to enhance the theoretical understanding of model training through a +sparsification-aware optimization approach. + +
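+
+ A toy rendering of the sparsification-aware idea above (a loose sketch, not
+the paper's exact objective or analysis): random masks play the role of sketch
+operators applied to the model and to the gradient inside an SGD loop.
+
+    import numpy as np
+
+    def sparsified_sgd(w, grad_fn, lr=0.1, keep_prob=0.5, steps=100, seed=0):
+        rng = np.random.default_rng(seed)
+        for _ in range(steps):
+            mask_w = rng.random(w.shape) < keep_prob  # sketch the model
+            g = grad_fn(w * mask_w)                   # gradient at the sparsified model
+            mask_g = rng.random(w.shape) < keep_prob  # sketch the gradient
+            w = w - lr * (g * mask_g) / keep_prob     # rescale to keep the step unbiased
+        return w
+
+    # e.g. for least squares: grad_fn = lambda w: A.T @ (A @ w - b) / len(b)
+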
+
+ comment: 58 pages, 5 figures +
+
+
+
+
+ + ☆ Transformer-QEC: Quantum Error Correction Code Decoding with + Transferable Transformers FAST + + +
+ Quantum computing has the potential to solve problems that are intractable +for classical systems, yet the high error rates in contemporary quantum devices +often exceed tolerable limits for useful algorithm execution. Quantum Error +Correction (QEC) mitigates this by employing redundancy, distributing quantum +information across multiple data qubits and utilizing syndrome qubits to +monitor their states for errors. The syndromes are subsequently interpreted by +a decoding algorithm to identify and correct errors in the data qubits. This +task is complex due to the multiplicity of error sources affecting both data +and syndrome qubits as well as syndrome extraction operations. Additionally, +identical syndromes can emanate from different error sources, necessitating a +decoding algorithm that evaluates syndromes collectively. Although machine +learning (ML) decoders such as multi-layer perceptrons (MLPs) and convolutional +neural networks (CNNs) have been proposed, they often focus on local syndrome +regions and require retraining when adjusting for different code distances. We +introduce a transformer-based QEC decoder which employs self-attention to +achieve a global receptive field across all input syndromes. It incorporates a +mixed loss training approach, combining both local physical error and global +parity label losses. Moreover, the transformer architecture's inherent +adaptability to variable-length inputs allows for efficient transfer learning, +enabling the decoder to adapt to varying code distances without retraining. + Evaluation on six code distances and ten different error configurations +demonstrates that our model consistently outperforms non-ML decoders, such as +Union Find (UF) and Minimum Weight Perfect Matching (MWPM), and other ML +decoders, thereby achieving best logical error rates. Moreover, the transfer +learning can save over 10x of training cost. + +
+
+ comment: Accepted to ICCAD 2023, FAST ML for Science Workshop; 7 pages, 8 + figures +
+
+
+
+
+ + ☆ XLB: Distributed Multi-GPU Lattice Boltzmann Simulation Framework for + Differentiable Scientific Machine Learning + + +
+ The lattice Boltzmann method (LBM) has emerged as a prominent technique for
+solving fluid dynamics problems due to its algorithmic potential for
+computational scalability. We introduce the XLB framework, a Python-based
+differentiable LBM library which harnesses the capabilities of the JAX
+framework. The architecture of XLB is predicated upon ensuring accessibility,
+extensibility, and computational performance, enabling it to scale effectively
+across CPU, multi-GPU, and distributed multi-GPU systems. The framework can be
+readily augmented with novel boundary conditions, collision models, or
+simulation capabilities. XLB offers the unique advantage of integration with
+JAX's extensive machine learning ecosystem, and the ability to utilize
+automatic differentiation for tackling physics-based machine learning,
+optimization, and inverse problems. XLB has been successfully scaled to handle
+simulations with billions of cells, achieving giga-scale lattice updates per
+second. XLB is released under the permissive Apache-2.0 license and is
+available on GitHub at https://github.com/Autodesk/XLB.
+
+&#13;
+
+
+
+
+ + ☆ MEDITRON-70B: Scaling Medical Pretraining for Large Language Models + + +
+ Large language models (LLMs) can potentially democratize access to medical +knowledge. While many efforts have been made to harness and improve LLMs' +medical knowledge and reasoning capacities, the resulting models are either +closed-source (e.g., PaLM, GPT-4) or limited in scale (<= 13B parameters), +which restricts their abilities. In this work, we improve access to large-scale +medical LLMs by releasing MEDITRON: a suite of open-source LLMs with 7B and 70B +parameters adapted to the medical domain. MEDITRON builds on Llama-2 (through +our adaptation of Nvidia's Megatron-LM distributed trainer), and extends +pretraining on a comprehensively curated medical corpus, including selected +PubMed articles, abstracts, and internationally-recognized medical guidelines. +Evaluations using four major medical benchmarks show significant performance +gains over several state-of-the-art baselines before and after task-specific +finetuning. Overall, MEDITRON achieves a 6% absolute performance gain over the +best public baseline in its parameter class and 3% over the strongest baseline +we finetuned from Llama-2. Compared to closed-source LLMs, MEDITRON-70B +outperforms GPT-3.5 and Med-PaLM and is within 5% of GPT-4 and 10% of +Med-PaLM-2. We release our code for curating the medical pretraining corpus and +the MEDITRON model weights to drive open-source development of more capable +medical LLMs. + +
+
+
+
+
+ + ☆ A Survey on Vulnerability of Federated Learning: A Learning Algorithm + Perspective + + +
+ This review paper takes a comprehensive look at malicious attacks against
+federated learning (FL), categorizing them from new perspectives on attack
+origins and targets, and providing insights into their methodology and impact.
+In this survey, we focus on threat models targeting the learning process of FL
+systems. Based on the source and target of the attack, we categorize existing
+threat models into four types: Data to Model (D2M), Model to Data (M2D), Model
+to Model (M2M), and composite attacks. For each attack type, we discuss the
+defense strategies proposed, highlighting their effectiveness, assumptions,
+and potential areas for improvement. Defense strategies have evolved from
+using a single metric to exclude malicious clients to employing multifaceted
+approaches that examine client models at various phases. Our analysis
+indicates that the training data, the learning gradients, and the learned
+model at different stages can all be manipulated to initiate malicious
+attacks, ranging from undermining model performance to reconstructing private
+local data and inserting backdoors. We have also seen these threats become
+more insidious. While earlier studies typically amplified malicious gradients,
+recent endeavors subtly alter the least significant weights in local models to
+bypass defense measures. This literature review provides a holistic
+understanding of the current FL threat landscape and highlights the importance
+of developing robust, efficient, and privacy-preserving defenses to ensure the
+safe and trusted adoption of FL in real-world applications.
+
+&#13;
+
+ comment: https://github.com/Rand2AI/Awesome-Vulnerability-of-Federated-Learning +
+
+
+
+
+ + ☆ Metric Space Magnitude for Evaluating Unsupervised Representation + Learning + + +
+ The magnitude of a metric space was recently established as a novel +invariant, providing a measure of the `effective size' of a space across +multiple scales. By capturing both geometrical and topological properties of +data, magnitude is poised to address challenges in unsupervised representation +learning tasks. We formalise a novel notion of dissimilarity between magnitude +functions of finite metric spaces and use them to derive a quality measure for +dimensionality reduction tasks. Our measure is provably stable under +perturbations of the data, can be efficiently calculated, and enables a +rigorous multi-scale comparison of embeddings. We show the utility of our +measure in an experimental suite that comprises different domains and tasks, +including the comparison of data visualisations. + +
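+
+ For reference, the magnitude function itself is straightforward to compute
+for a small point cloud (a quick sketch of the standard definition, not the
+paper's dissimilarity measure or stability results):
+
+    import numpy as np
+    from scipy.spatial.distance import cdist
+
+    def magnitude_function(points, scales):
+        """Magnitude at each scale t: sum of the entries of the inverse of the
+        similarity matrix Z_ij = exp(-t * d_ij)."""
+        d = cdist(points, points)
+        mags = []
+        for t in scales:
+            z = np.exp(-t * d)
+            mags.append(np.linalg.inv(z).sum())   # assumes Z is invertible
+        return np.array(mags)
+
+    points = np.random.default_rng(0).normal(size=(50, 3))
+    print(magnitude_function(points, scales=[0.5, 1.0, 2.0, 4.0]))
+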
+
+
+
+
+ + ☆ OccWorld: Learning a 3D Occupancy World Model for Autonomous Driving + + +
+ Understanding how the 3D scene evolves is vital for making decisions in +autonomous driving. Most existing methods achieve this by predicting the +movements of object boxes, which cannot capture more fine-grained scene +information. In this paper, we explore a new framework of learning a world +model, OccWorld, in the 3D Occupancy space to simultaneously predict the +movement of the ego car and the evolution of the surrounding scenes. We propose +to learn a world model based on 3D occupancy rather than 3D bounding boxes and +segmentation maps for three reasons: 1) expressiveness. 3D occupancy can +describe the more fine-grained 3D structure of the scene; 2) efficiency. 3D +occupancy is more economical to obtain (e.g., from sparse LiDAR points). 3) +versatility. 3D occupancy can adapt to both vision and LiDAR. To facilitate the +modeling of the world evolution, we learn a reconstruction-based scene +tokenizer on the 3D occupancy to obtain discrete scene tokens to describe the +surrounding scenes. We then adopt a GPT-like spatial-temporal generative +transformer to generate subsequent scene and ego tokens to decode the future +occupancy and ego trajectory. Extensive experiments on the widely used nuScenes +benchmark demonstrate the ability of OccWorld to effectively model the +evolution of the driving scenes. OccWorld also produces competitive planning +results without using instance and map supervision. Code: +https://github.com/wzzheng/OccWorld. + +
+
+ comment: Code is available at: https://github.com/wzzheng/OccWorld +
+
+
+
+
+ + ☆ RobustState: Boosting Fidelity of Quantum State Preparation via + Noise-Aware Variational Training FAST + + +
+ Quantum state preparation, a crucial subroutine in quantum computing, +involves generating a target quantum state from initialized qubits. Arbitrary +state preparation algorithms can be broadly categorized into arithmetic +decomposition (AD) and variational quantum state preparation (VQSP). AD employs +a predefined procedure to decompose the target state into a series of gates, +whereas VQSP iteratively tunes ansatz parameters to approximate target state. +VQSP is particularly apt for Noisy-Intermediate Scale Quantum (NISQ) machines +due to its shorter circuits. However, achieving noise-robust parameter +optimization still remains challenging. + We present RobustState, a novel VQSP training methodology that combines high +robustness with high training efficiency. The core idea involves utilizing +measurement outcomes from real machines to perform back-propagation through +classical simulators, thus incorporating real quantum noise into gradient +calculations. RobustState serves as a versatile, plug-and-play technique +applicable for training parameters from scratch or fine-tuning existing +parameters to enhance fidelity on target machines. It is adaptable to various +ansatzes at both gate and pulse levels and can even benefit other variational +algorithms, such as variational unitary synthesis. + Comprehensive evaluation of RobustState on state preparation tasks for 4 +distinct quantum algorithms using 10 real quantum machines demonstrates a +coherent error reduction of up to 7.1 $\times$ and state fidelity improvement +of up to 96\% and 81\% for 4-Q and 5-Q states, respectively. On average, +RobustState improves fidelity by 50\% and 72\% for 4-Q and 5-Q states compared +to baseline approaches. + +
+
+ comment: Accepted to FASTML @ ICCAD 2023. 14 pages, 20 figures +
+
+
+
+
+ + ☆ Machine Learning-Enhanced Aircraft Landing Scheduling under + Uncertainties + + +
+ This paper addresses aircraft delays, emphasizing their impact on safety and +financial losses. To mitigate these issues, an innovative machine learning +(ML)-enhanced landing scheduling methodology is proposed, aiming to improve +automation and safety. Analyzing flight arrival delay scenarios reveals strong +multimodal distributions and clusters in arrival flight time durations. A +multi-stage conditional ML predictor enhances separation time prediction based +on flight events. ML predictions are then integrated as safety constraints in a +time-constrained traveling salesman problem formulation, solved using +mixed-integer linear programming (MILP). Historical flight recordings and model +predictions address uncertainties between successive flights, ensuring +reliability. The proposed method is validated using real-world data from the +Atlanta Air Route Traffic Control Center (ARTCC ZTL). Case studies demonstrate +an average 17.2% reduction in total landing time compared to the +First-Come-First-Served (FCFS) rule. Unlike FCFS, the proposed methodology +considers uncertainties, instilling confidence in scheduling. The study +concludes with remarks and outlines future research directions. + +
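+
+ To make the formulation concrete, here is a toy separation-constrained
+landing schedule as a MILP in PuLP (an illustrative sketch only; the paper's
+model uses ML-predicted, flight-dependent separation times, which are replaced
+by a constant here):
+
+    import pulp
+
+    eta = [0, 2, 3, 7]        # earliest arrival times (minutes)
+    sep = 3                   # required separation between successive landings
+    big_m = 1000
+
+    prob = pulp.LpProblem("landing_scheduling", pulp.LpMinimize)
+    t = [pulp.LpVariable(f"t_{i}", lowBound=eta[i]) for i in range(len(eta))]
+    makespan = pulp.LpVariable("makespan", lowBound=0)
+
+    for i in range(len(eta)):
+        prob += makespan >= t[i]
+        for j in range(i + 1, len(eta)):
+            order = pulp.LpVariable(f"y_{i}_{j}", cat="Binary")  # i lands before j?
+            prob += t[j] >= t[i] + sep - big_m * (1 - order)
+            prob += t[i] >= t[j] + sep - big_m * order
+
+    prob += makespan                              # objective: last landing time
+    prob.solve(pulp.PULP_CBC_CMD(msg=False))
+    print([pulp.value(v) for v in t], "makespan:", pulp.value(makespan))
+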
+
+
+
+
+ + ☆ A Neural Framework for Generalized Causal Sensitivity Analysis + + +
+ Unobserved confounding is common in many applications, making causal
+inference from observational data challenging. As a remedy, causal sensitivity
+analysis is an important tool to draw causal conclusions under unobserved
+confounding with mathematical guarantees. In this paper, we propose NeuralCSA,
+a neural framework for generalized causal sensitivity analysis. Unlike previous
+work, our framework is compatible with (i) a large class of sensitivity models,
+including the marginal sensitivity model, f-sensitivity models, and Rosenbaum's
+sensitivity model; (ii) different treatment types (i.e., binary and
+continuous); and (iii) different causal queries, including (conditional)
+average treatment effects and simultaneous effects on multiple outcomes. The
+generality of NeuralCSA is achieved by learning a latent distribution shift
+that corresponds to a treatment intervention using two conditional normalizing
+flows. We provide theoretical guarantees that NeuralCSA is able to infer valid
+bounds on the causal query of interest and also demonstrate this empirically
+using both simulated and real-world data.
+
+&#13;
+
+
+
+
+ + ☆ Scheduling and Communication Schemes for Decentralized Federated + Learning + + +
+ Federated learning (FL) is a distributed machine learning paradigm in which a
+large number of clients coordinate with a central server to learn a model
+without sharing their own training data. A single central server may not be
+sufficient due to connectivity problems with clients. In this paper, a
+decentralized federated learning (DFL) model with the stochastic gradient
+descent (SGD) algorithm has been introduced as a more scalable approach to
+improve the learning performance in a network of agents with arbitrary
+topology. Three scheduling policies for DFL have been proposed for
+communications between the clients and the parallel servers, and the
+convergence, accuracy, and loss have been tested in a fully decentralized
+implementation of SGD. The experimental results show that the proposed
+scheduling policies have an impact on both the speed of convergence and the
+final global model.
+
+&#13;
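+
+ As an illustration of the decentralized setting described above (a minimal
+sketch of one gossip-style round, not the paper's three scheduling policies):
+each client takes a local SGD step and then averages its parameters with the
+neighbours selected for this round.
+
+    import numpy as np
+
+    def dfl_round(params, grads, adjacency, lr=0.1):
+        """params, grads: (n_clients, dim); adjacency: (n, n) 0/1 schedule."""
+        local = params - lr * grads                    # local SGD step
+        new_params = np.zeros_like(local)
+        for i in range(len(local)):
+            neighbours = np.flatnonzero(adjacency[i])  # clients scheduled with i
+            group = np.concatenate(([i], neighbours)).astype(int)
+            new_params[i] = local[group].mean(axis=0)  # gossip averaging
+        return new_params
+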
+
+ comment: 32nd International Conference on Computer Theory and Applications + (ICCTA), Alexandria, Egypt, 2022 +
+
+
+
+
+ + ☆ Using Decentralized Aggregation for Federated Learning with Differential + Privacy + + +
+ Nowadays, the ubiquitous use of mobile devices and networks has raised
+concerns about the loss of control over personal data, and research has
+advanced towards the trade-off between privacy and utility in scenarios that
+combine communication exchanges, big databases, and distributed and
+collaborative (P2P) Machine Learning techniques. On the other hand, although
+Federated Learning (FL) provides some level of privacy by retaining the data
+at the local node, which performs local training to enrich a global model,
+this scenario is still susceptible to privacy breaches such as membership
+inference attacks. To provide a stronger level of privacy, this research
+deploys an experimental environment for FL with Differential Privacy (DP)
+using benchmark datasets. The obtained results show, by means of a
+classification example, that the choice of DP parameters and techniques is
+central to the aforementioned trade-off between privacy and utility.
+
+&#13;
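+
+ The core DP ingredient in such a setup is typically the Gaussian mechanism
+applied to clipped client updates before aggregation. A generic sketch (not
+the exact configuration evaluated above):
+
+    import numpy as np
+
+    def dp_federated_average(client_updates, clip_norm=1.0,
+                             noise_multiplier=1.0, seed=0):
+        rng = np.random.default_rng(seed)
+        clipped = []
+        for u in client_updates:
+            norm = np.linalg.norm(u)
+            clipped.append(u * min(1.0, clip_norm / (norm + 1e-12)))  # clip
+        avg = np.mean(clipped, axis=0)
+        sigma = noise_multiplier * clip_norm / len(client_updates)
+        return avg + rng.normal(0.0, sigma, size=avg.shape)  # noisy global update
+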
+
+
+
+
+ + ☆ Improved Data Generation for Enhanced Asset Allocation: A Synthetic + Dataset Approach for the Fixed Income Universe + + +
+ We present a novel process for generating synthetic datasets tailored to +assess asset allocation methods and construct portfolios within the fixed +income universe. Our approach begins by enhancing the CorrGAN model to generate +synthetic correlation matrices. Subsequently, we propose an Encoder-Decoder +model that samples additional data conditioned on a given correlation matrix. +The resulting synthetic dataset facilitates in-depth analyses of asset +allocation methods across diverse asset universes. Additionally, we provide a +case study that exemplifies the use of the synthetic dataset to improve +portfolios constructed within a simulation-based asset allocation process. + +
+
+
+
+
+ + ☆ Forecasting Auxiliary Energy Consumption for Electric Heavy-Duty + Vehicles + + +
+ Accurate energy consumption prediction is crucial for optimizing the
+operation of electric commercial heavy-duty vehicles, e.g., route planning for
+charging. Moreover, understanding why certain predictions are cast is
+paramount for such a predictive model to gain user trust and be deployed in
+practice. Since commercial vehicles operate differently as transportation
+tasks, ambient conditions, and drivers vary, a heterogeneous population is
+expected when building an AI system for forecasting energy consumption. The
+dependencies between the input features and the target values are also
+expected to differ across sub-populations. One well-known example of such a
+statistical phenomenon is the Simpson paradox. In this paper, we illustrate
+that such a setting poses a challenge for existing XAI methods that produce
+global feature statistics, e.g. LIME or SHAP, causing them to yield misleading
+results. We demonstrate a potential solution by training multiple regression
+models on subsets of data. This not only leads to superior regression
+performance but also to more relevant and consistent LIME explanations. Given
+that the employed groupings correspond to relevant sub-populations, the
+associations between the input features and the target values are consistent
+within each cluster but different across clusters. Experiments on both
+synthetic and real-world datasets show that such splitting of a complex
+problem into simpler ones yields better regression performance and
+interpretability.
+
+&#13;
+
+
+
+
+ + ☆ Automated Measurement of Vascular Calcification in Femoral + Endarterectomy Patients Using Deep Learning + + +
+ Atherosclerosis, a chronic inflammatory disease affecting the large arteries, +presents a global health risk. Accurate analysis of diagnostic images, like +computed tomographic angiograms (CTAs), is essential for staging and monitoring +the progression of atherosclerosis-related conditions, including peripheral +arterial disease (PAD). However, manual analysis of CTA images is +time-consuming and tedious. To address this limitation, we employed a deep +learning model to segment the vascular system in CTA images of PAD patients +undergoing femoral endarterectomy surgery and to measure vascular calcification +from the left renal artery to the patella. Utilizing proprietary CTA images of +27 patients undergoing femoral endarterectomy surgery provided by Prisma Health +Midlands, we developed a Deep Neural Network (DNN) model to first segment the +arterial system, starting from the descending aorta to the patella, and second, +to provide a metric of arterial calcification. Our designed DNN achieved 83.4% +average Dice accuracy in segmenting arteries from aorta to patella, advancing +the state-of-the-art by 0.8%. Furthermore, our work is the first to present a +robust statistical analysis of automated calcification measurement in the lower +extremities using deep learning, attaining a Mean Absolute Percentage Error +(MAPE) of 9.5% and a correlation coefficient of 0.978 between automated and +manual calcification scores. These findings underscore the potential of deep +learning techniques as a rapid and accurate tool for medical professionals to +assess calcification in the abdominal aorta and its branches above the patella. +The developed DNN model and related documentation in this project are available +at GitHub page at https://github.com/pip-alireza/DeepCalcScoring. + +
+
+ comment: Published in MDPI Diagnostic journal, the code can be accessed via + the GitHub link in the paper +
+
+
+
+
+ + ☆ Closing the ODE-SDE gap in score-based diffusion models through the + Fokker-Planck equation + + +
+ Score-based diffusion models have emerged as one of the most promising +frameworks for deep generative modelling, due to their state-of-the art +performance in many generation tasks while relying on mathematical foundations +such as stochastic differential equations (SDEs) and ordinary differential +equations (ODEs). Empirically, it has been reported that ODE based samples are +inferior to SDE based samples. In this paper we rigorously describe the range +of dynamics and approximations that arise when training score-based diffusion +models, including the true SDE dynamics, the neural approximations, the various +approximate particle dynamics that result, as well as their associated +Fokker--Planck equations and the neural network approximations of these +Fokker--Planck equations. We systematically analyse the difference between the +ODE and SDE dynamics of score-based diffusion models, and link it to an +associated Fokker--Planck equation. We derive a theoretical upper bound on the +Wasserstein 2-distance between the ODE- and SDE-induced distributions in terms +of a Fokker--Planck residual. We also show numerically that conventional +score-based diffusion models can exhibit significant differences between ODE- +and SDE-induced distributions which we demonstrate using explicit comparisons. +Moreover, we show numerically that reducing the Fokker--Planck residual by +adding it as an additional regularisation term leads to closing the gap between +ODE- and SDE-induced distributions. Our experiments suggest that this +regularisation can improve the distribution generated by the ODE, however that +this can come at the cost of degraded SDE sample quality. + +
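+
+ For orientation, these are the textbook score-SDE objects the analysis above
+refers to (standard forms with a learned score s_theta, not equations copied
+from the paper): the reverse-time SDE and probability-flow ODE used for
+sampling, and the Fokker-Planck equation that the true marginals satisfy; the
+paper's regulariser penalises how far the learned score is from satisfying the
+latter.
+
+    % LaTeX sketch of the standard definitions
+    \begin{align}
+      \mathrm{d}x &= \bigl[f(x,t) - g(t)^2 s_\theta(x,t)\bigr]\,\mathrm{d}t
+                     + g(t)\,\mathrm{d}\bar{w}   && \text{(reverse-time SDE)} \\
+      \mathrm{d}x &= \bigl[f(x,t) - \tfrac{1}{2} g(t)^2 s_\theta(x,t)\bigr]\,\mathrm{d}t
+                     && \text{(probability-flow ODE)} \\
+      \partial_t p_t &= -\nabla \cdot \bigl(f\, p_t\bigr)
+                     + \tfrac{1}{2} g(t)^2 \,\Delta p_t  && \text{(Fokker--Planck)}
+    \end{align}
+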
+
+
+
+
+ + ☆ Sensitivity-Based Layer Insertion for Residual and Feedforward Neural + Networks + + +
+ The training of neural networks requires tedious and often manual tuning of +the network architecture. We propose a systematic method to insert new layers +during the training process, which eliminates the need to choose a fixed +network size before training. Our technique borrows techniques from constrained +optimization and is based on first-order sensitivity information of the +objective with respect to the virtual parameters that additional layers, if +inserted, would offer. We consider fully connected feedforward networks with +selected activation functions as well as residual neural networks. In numerical +experiments, the proposed sensitivity-based layer insertion technique exhibits +improved training decay, compared to not inserting the layer. Furthermore, the +computational effort is reduced in comparison to inserting the layer from the +beginning. The code is available at +\url{https://github.com/LeonieKreis/layer_insertion_sensitivity_based}. + +
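+
+ One plausible way to realise the first-order sensitivity idea above (a hedged
+sketch, not the repository's construction): a candidate residual layer is
+"virtually" inserted with its weights at zero, so the forward pass is
+unchanged, and the gradient of the loss with respect to those zero weights
+scores how much inserting the layer could help at that position.
+
+    import torch
+
+    def insertion_sensitivity(model_before, model_after, x, y, loss_fn, dim):
+        w = torch.zeros(dim, dim, requires_grad=True)   # virtual layer weights
+        h = model_before(x)
+        h = h + torch.relu(h) @ w.T       # vanishes at w = 0: output unchanged
+        loss = loss_fn(model_after(h), y)
+        (g,) = torch.autograd.grad(loss, (w,))
+        return g.norm()          # larger value = more promising insertion point
+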
+
+
+
+
+ + ☆ Should We Learn Most Likely Functions or Parameters? NeurIPS 2023 + + +
+ Standard regularized training procedures correspond to maximizing a posterior +distribution over parameters, known as maximum a posteriori (MAP) estimation. +However, model parameters are of interest only insomuch as they combine with +the functional form of a model to provide a function that can make good +predictions. Moreover, the most likely parameters under the parameter posterior +do not generally correspond to the most likely function induced by the +parameter posterior. In fact, we can re-parametrize a model such that any +setting of parameters can maximize the parameter posterior. As an alternative, +we investigate the benefits and drawbacks of directly estimating the most +likely function implied by the model and the data. We show that this procedure +leads to pathological solutions when using neural networks and prove conditions +under which the procedure is well-behaved, as well as a scalable approximation. +Under these conditions, we find that function-space MAP estimation can lead to +flatter minima, better generalization, and improved robustness to overfitting. + +
+
+ comment: NeurIPS 2023. Code available at + https://github.com/activatedgeek/function-space-map +
+
+
+
+
+ + ☆ Sparsify-then-Classify: From Internal Neurons of Large Language Models + To Efficient Text Classifiers + + +
+ Among the many tasks that Large Language Models (LLMs) have revolutionized is +text classification. However, existing approaches for applying pretrained LLMs +to text classification predominantly rely on using single token outputs from +only the last layer of hidden states. As a result, they suffer from limitations +in efficiency, task-specificity, and interpretability. In our work, we +contribute an approach that uses all internal representations by employing +multiple pooling strategies on all activation and hidden states. Our novel +lightweight strategy, Sparsify-then-Classify (STC) first sparsifies +task-specific features layer-by-layer, then aggregates across layers for text +classification. STC can be applied as a seamless plug-and-play module on top of +existing LLMs. Our experiments on a comprehensive set of models and datasets +demonstrate that STC not only consistently improves the classification +performance of pretrained and fine-tuned models, but is also more efficient for +both training and inference, and is more intrinsically interpretable. + +
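+
+ A small self-contained approximation of the layer-wise recipe above (random
+stand-ins replace real LLM activations; this is not the official STC code):
+per-layer pooled features are sparsified by keeping the units most correlated
+with the label, then aggregated across layers for a linear classifier.
+
+    import numpy as np
+    from sklearn.linear_model import LogisticRegression
+
+    rng = np.random.default_rng(0)
+    n_docs, n_layers, dim, k = 500, 12, 256, 16
+    hidden = rng.normal(size=(n_docs, n_layers, dim))  # pooled activations per layer
+    labels = rng.integers(0, 2, size=n_docs)
+
+    selected = []
+    for layer in range(n_layers):
+        feats = hidden[:, layer, :]
+        scores = np.abs(np.corrcoef(feats.T, labels)[-1, :-1])  # label correlation
+        top = np.argsort(scores)[-k:]                           # sparsify: top-k units
+        selected.append(feats[:, top])
+
+    features = np.concatenate(selected, axis=1)                 # aggregate layers
+    clf = LogisticRegression(max_iter=1000).fit(features, labels)
+    print("train accuracy:", clf.score(features, labels))
+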
+
+ comment: 23 pages, 5 figures, 8 tables Code available at + https://github.com/difanj0713/Sparsify-then-Classify +
+
+
+
+
+ + ☆ Soil Organic Carbon Estimation from Climate-related Features with Graph + Neural Network + + +
+ Soil organic carbon (SOC) plays a pivotal role in the global carbon cycle, +impacting climate dynamics and necessitating accurate estimation for +sustainable land and agricultural management. While traditional methods of SOC +estimation face resolution and accuracy challenges, recent technological +solutions harness remote sensing, machine learning, and high-resolution +satellite mapping. Graph Neural Networks (GNNs), especially when integrated +with positional encoders, can capture complex relationships between soil and +climate. Using the LUCAS database, this study compared four GNN operators in +the positional encoder framework. Results revealed that the PESAGE and +PETransformer models outperformed others in SOC estimation, indicating their +potential in capturing the complex relationship between SOC and climate +features. Our findings confirm the feasibility of applications of GNN +architectures in SOC prediction, establishing a framework for future +explorations of this topic with more advanced GNN models. + +
+
+
+
+
+ + ☆ Towards Transfer Learning for Large-Scale Image Classification Using + Annealing-based Quantum Boltzmann Machines + + +
+ Quantum Transfer Learning (QTL) recently gained popularity as a hybrid
+quantum-classical approach for image classification tasks by efficiently
+combining the feature extraction capabilities of large Convolutional Neural
+Networks with the potential benefits of Quantum Machine Learning (QML).
+Existing approaches, however, only utilize gate-based Variational Quantum
+Circuits for the quantum part of these procedures. In this work we present an
+approach to employ Quantum Annealing (QA) in QTL-based image classification.
+Specifically, we propose using annealing-based Quantum Boltzmann Machines as
+part of a hybrid quantum-classical pipeline to learn the classification of
+real-world, large-scale data such as medical images through supervised
+training. We demonstrate our approach by applying it to the three-class
+COVID-CT-MD dataset, a collection of lung Computed Tomography (CT) scan
+slices. Using Simulated Annealing as a stand-in for actual QA, we compare our
+method to classical transfer learning, using a neural network of the same
+order of magnitude, to demonstrate its improved classification performance. We
+find that our approach consistently outperforms its classical baseline in
+terms of test accuracy and AUC-ROC-Score and needs fewer training epochs to do
+so.
+
+&#13;
+
+ comment: 7 pages, 3 figures (5 if counting subfigures), 1 table. To be + published in the proceedings of the 2023 IEEE International Conference on + Quantum Computing and Engineering (QCE) +
+
+
+
+
+ + ☆ Efficient Pre-training for Localized Instruction Generation of Videos + + +
+ Procedural videos show step-by-step demonstrations of tasks like recipe
+preparation. Understanding such videos is challenging, involving the precise
+localization of steps and the generation of textual instructions. Manually
+annotating steps and writing instructions is costly, which limits the size of
+current datasets and hinders effective learning. Leveraging large but noisy
+video-transcript datasets for pre-training can boost performance, but demands
+significant computational resources. Furthermore, transcripts contain
+irrelevant content and exhibit style variation compared to instructions written
+by human annotators. To mitigate both issues, we propose a technique,
+Sieve-&-Swap, to automatically curate a smaller dataset: (i) Sieve filters
+irrelevant transcripts and (ii) Swap enhances the quality of the text
+instruction by automatically replacing the transcripts with human-written
+instructions from a text-only recipe dataset. The curated dataset, three orders
+of magnitude smaller than current web-scale datasets, enables efficient
+training of large-scale models with competitive performance. We complement our
+Sieve-&-Swap approach with a Procedure Transformer (ProcX) for end-to-end step
+localization and instruction generation for procedural videos. When this model
+is pre-trained on our curated dataset, it achieves state-of-the-art performance
+in zero-shot and finetuning settings on YouCook2 and Tasty, while using a
+fraction of the computational resources.
+
+
+
+
+
+
+ + ☆ Maximum Likelihood Estimation is All You Need for Well-Specified + Covariate Shift + + +
+ A key challenge of modern machine learning systems is to achieve
+Out-of-Distribution (OOD) generalization -- generalizing to target data whose
+distribution differs from that of source data. Despite its significant
+importance, the fundamental question of ``what are the most effective
+algorithms for OOD generalization'' remains open even under the standard
+setting of covariate shift. This paper addresses this fundamental question by
+proving that, surprisingly, classical Maximum Likelihood Estimation (MLE)
+purely using source data (without any modification) achieves the minimax
+optimality for covariate shift under the well-specified setting. That is, no
+algorithm performs better than MLE in this setting (up to a constant factor),
+justifying that MLE is all you need. Our result holds for a very rich class of
+parametric models, and does not require any boundedness condition on the
+density ratio. We illustrate the wide applicability of our framework by
+instantiating it to three concrete examples -- linear regression, logistic
+regression, and phase retrieval. This paper further complements the study by
+proving that, under the misspecified setting, MLE is no longer the optimal
+choice, whereas the Maximum Weighted Likelihood Estimator (MWLE) emerges as
+minimax optimal in certain scenarios.
+
+
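+ Editor's note: in generic notation (not necessarily the paper's), the
+well-specified covariate-shift setting and the two estimators contrasted above
+can be written as follows.
+
+```latex
+% Source and target share the conditional law p(y | x; \theta^*);
+% only the covariate marginals P_S (training) and P_T (test) differ.
+\[
+  \hat{\theta}_{\mathrm{MLE}} = \arg\max_{\theta} \frac{1}{n}\sum_{i=1}^{n} \log p(y_i \mid x_i; \theta),
+  \qquad x_i \sim P_S, \;\; y_i \mid x_i \sim p(\cdot \mid x_i; \theta^{*}),
+\]
+\[
+  \hat{\theta}_{\mathrm{MWLE}} = \arg\max_{\theta} \frac{1}{n}\sum_{i=1}^{n} w(x_i)\, \log p(y_i \mid x_i; \theta),
+  \qquad w(x) = \frac{P_T(x)}{P_S(x)}.
+\]
+```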
+
+
+
+
+ + ☆ Addressing Long-Horizon Tasks by Integrating Program Synthesis and State + Machines + + +
+ Deep reinforcement learning excels in various domains but lacks
+generalizability and interpretability. Programmatic RL methods (Trivedi et al.,
+2021; Liu et al., 2023) reformulate solving RL tasks as synthesizing
+interpretable programs that can be executed in the environments. Despite
+encouraging results, these methods are limited to short-horizon tasks. On the
+other hand, representing RL policies using state machines (Inala et al., 2020)
+can inductively generalize to long-horizon tasks; however, it struggles to
+scale up to acquire diverse and complex behaviors. This work proposes Program
+Machine Policies (POMPs), which bridge the advantages of programmatic RL and
+state machine policies, allowing for the representation of complex behaviors
+and the handling of long-horizon tasks. Specifically, we introduce a method
+that can retrieve a set of effective, diverse, compatible programs. Then, we
+use these programs as modes of a state machine and learn a transition function
+to transition among mode programs, allowing for capturing long-horizon
+repetitive behaviors. Our proposed framework outperforms programmatic RL and
+deep RL baselines on various tasks and demonstrates the ability to inductively
+generalize to even longer horizons without any fine-tuning. Ablation studies
+justify the effectiveness of our proposed search algorithm for retrieving a set
+of programs as modes.
+
+
+
+
+
+
+ + ☆ Replay across Experiments: A Natural Extension of Off-Policy RL + + +
+ Replaying data is a principal mechanism underlying the stability and data +efficiency of off-policy reinforcement learning (RL). We present an effective +yet simple framework to extend the use of replays across multiple experiments, +minimally adapting the RL workflow for sizeable improvements in controller +performance and research iteration times. At its core, Replay Across +Experiments (RaE) involves reusing experience from previous experiments to +improve exploration and bootstrap learning while reducing required changes to a +minimum in comparison to prior work. We empirically show benefits across a +number of RL algorithms and challenging control domains spanning both +locomotion and manipulation, including hard exploration tasks from egocentric +vision. Through comprehensive ablations, we demonstrate robustness to the +quality and amount of data available and various hyperparameter choices. +Finally, we discuss how our approach can be applied more broadly across +research life cycles and can increase resilience by reloading data across +random seeds or hyperparameter variations. + +
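+ Editor's illustration (not the authors' implementation): a minimal sketch of
+the core idea, reloading transitions saved by earlier experiments into the
+replay buffer and mixing them with freshly collected data; the buffer layout
+and the mixing ratio are assumptions.
+
+```python
+import random
+
+class MixedReplayBuffer:
+    """Sketch: off-policy replay that mixes data from prior runs with new data."""
+    def __init__(self, prior_transitions, capacity=1_000_000, prior_fraction=0.5):
+        self.prior = list(prior_transitions)   # transitions reloaded from earlier experiments
+        self.fresh = []                        # transitions from the current experiment
+        self.capacity = capacity
+        self.prior_fraction = prior_fraction
+
+    def add(self, transition):
+        self.fresh.append(transition)
+        if len(self.fresh) > self.capacity:
+            self.fresh.pop(0)
+
+    def sample(self, batch_size):
+        n_prior = min(int(batch_size * self.prior_fraction), len(self.prior))
+        batch = random.sample(self.prior, n_prior) if n_prior else []
+        n_fresh = min(batch_size - len(batch), len(self.fresh))
+        return batch + random.sample(self.fresh, n_fresh)
+```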
+
+
+
+
+ + ☆ GloNets: Globally Connected Neural Networks + + +
+ Deep learning architectures suffer from depth-related performance +degradation, limiting the effective depth of neural networks. Approaches like +ResNet are able to mitigate this, but they do not completely eliminate the +problem. We introduce Globally Connected Neural Networks (GloNet), a novel +architecture overcoming depth-related issues, designed to be superimposed on +any model, enhancing its depth without increasing complexity or reducing +performance. With GloNet, the network's head uniformly receives information +from all parts of the network, regardless of their level of abstraction. This +enables GloNet to self-regulate information flow during training, reducing the +influence of less effective deeper layers, and allowing for stable training +irrespective of network depth. This paper details GloNet's design, its +theoretical basis, and a comparison with existing similar architectures. +Experiments show GloNet's self-regulation ability and resilience to +depth-related learning challenges, like performance degradation. Our findings +suggest GloNet as a strong alternative to traditional architectures like +ResNets. + +
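+ Editor's illustration only: a toy sketch of the globally connected idea,
+where every block's output reaches the head directly rather than only through
+the deepest block; the aggregation (a plain sum) and the sizes are assumptions,
+not the paper's exact design.
+
+```python
+import torch
+import torch.nn as nn
+
+class GloNetStyleSketch(nn.Module):
+    """Sketch: the head uniformly receives information from every depth."""
+    def __init__(self, dim=128, depth=16, num_classes=10):
+        super().__init__()
+        self.blocks = nn.ModuleList(
+            nn.Sequential(nn.Linear(dim, dim), nn.ReLU()) for _ in range(depth))
+        self.head = nn.Linear(dim, num_classes)
+
+    def forward(self, x):
+        collected = torch.zeros_like(x)
+        for block in self.blocks:
+            x = block(x)
+            collected = collected + x   # every block feeds the head, regardless of depth
+        return self.head(collected)
+```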
+
+
+
+
+ + ☆ Over-Squashing in Riemannian Graph Neural Networks + + +
+ Most graph neural networks (GNNs) are prone to the phenomenon of +over-squashing in which node features become insensitive to information from +distant nodes in the graph. Recent works have shown that the topology of the +graph has the greatest impact on over-squashing, suggesting graph rewiring +approaches as a suitable solution. In this work, we explore whether +over-squashing can be mitigated through the embedding space of the GNN. In +particular, we consider the generalization of Hyperbolic GNNs (HGNNs) to +Riemannian manifolds of variable curvature in which the geometry of the +embedding space is faithful to the graph's topology. We derive bounds on the +sensitivity of the node features in these Riemannian GNNs as the number of +layers increases, which yield promising theoretical and empirical results for +alleviating over-squashing in graphs with negative curvature. + +
+
+
+
+
+ + ☆ Physics-informed neural networks for transformed geometries and + manifolds + + +
+ Physics-informed neural networks (PINNs) effectively embed physical
+principles into machine learning, but often struggle with complex or
+alternating geometries. We propose a novel method for integrating geometric
+transformations within PINNs to robustly accommodate geometric variations. Our
+method incorporates a diffeomorphism as a mapping of a reference domain and
+adapts the derivative computation of the physics-informed loss function. This
+generalizes the applicability of PINNs not only to smoothly deformed domains,
+but also to lower-dimensional manifolds and allows for direct shape
+optimization while training the network. We demonstrate the effectiveness of
+our approach on several problems: (i) Eikonal equation on an Archimedean
+spiral, (ii) Poisson problem on a surface manifold, (iii) incompressible Stokes
+flow in a deformed tube, and (iv) shape optimization with the Laplace operator.
+Through these examples, we demonstrate the enhanced flexibility over
+traditional PINNs, especially under geometric variations. The proposed
+framework presents an outlook for training deep neural operators over
+parametrized geometries, paving the way for advanced modeling with PDEs on
+complex geometries in science and engineering.
+
+
+
+
+
+
+ + ☆ Towards Responsible Governance of Biological Design Tools NeurIPS 2023 + + +
+ Recent advancements in generative machine learning have enabled rapid +progress in biological design tools (BDTs) such as protein structure and +sequence prediction models. The unprecedented predictive accuracy and novel +design capabilities of BDTs present new and significant dual-use risks. For +example, their predictive accuracy allows biological agents, whether vaccines +or pathogens, to be developed more quickly, while the design capabilities could +be used to discover drugs or evade DNA screening techniques. Similar to other +dual-use AI systems, BDTs present a wicked problem: how can regulators uphold +public safety without stifling innovation? We highlight how current regulatory +proposals that are primarily tailored toward large language models may be less +effective for BDTs, which require fewer computational resources to train and +are often developed in an open-source manner. We propose a range of measures to +mitigate the risk that BDTs are misused, across the areas of responsible +development, risk assessment, transparency, access management, cybersecurity, +and investing in resilience. Implementing such measures will require close +coordination between developers and governments. + +
+
+ comment: 10 pages + references, 1 figure, accepted at NeurIPS 2023 Regulatable + ML as oral presentation +
+
+
+
+
+ + ☆ Reinforcement Learning for Wildfire Mitigation in Simulated Disaster + Environments NeurIPS 2023 + + +
+ Climate change has resulted in a year-over-year increase in adverse weather
+conditions that contribute to increasingly severe fire seasons. Without
+effective mitigation, these fires pose a threat to life, property, ecology,
+cultural heritage, and critical infrastructure. To better prepare for and react
+to the increasing threat of wildfires, more accurate fire modeling and
+mitigation responses are necessary. In this paper, we introduce SimFire, a
+versatile wildland fire projection simulator designed to generate realistic
+wildfire scenarios, and SimHarness, a modular agent-based machine learning
+wrapper capable of automatically generating land management strategies within
+SimFire to reduce the overall damage to the area. Together, this publicly
+available system gives researchers and practitioners the ability to emulate
+and assess the effectiveness of firefighter interventions and formulate
+strategic plans that prioritize value preservation and resource allocation
+optimization. The repositories are available for download at
+https://github.com/mitrefireline.
+
+
+
+ comment: 12 pages, 4 figures including Appendices (A, B). Accepted as a paper + in the Proposals track at the "Tackling Climate Change with Machine Learning" + workshop at NeurIPS 2023. MITRE Public Release Case Number 23-3920 +
+
+
+
+
+ + ☆ Diagnosis driven Anomaly Detection for CPS + + +
+ In Cyber-Physical Systems (CPS) research, anomaly detection (detecting +abnormal behavior) and diagnosis (identifying the underlying root cause) are +often treated as distinct, isolated tasks. However, diagnosis algorithms +require symptoms, i.e. temporally and spatially isolated anomalies, as input. +Thus, anomaly detection and diagnosis must be developed together to provide a +holistic solution for diagnosis in CPS. We therefore propose a method for +utilizing deep learning-based anomaly detection to generate inputs for +Consistency-Based Diagnosis (CBD). We evaluate our approach on a simulated and +a real-world CPS dataset, where our model demonstrates strong performance +relative to other state-of-the-art models. + +
+
+
+
+
+ + ☆ MetaDefa: Meta-learning based on Domain Enhancement and Feature + Alignment for Single Domain Generalization + + +
+ Single domain generalization (SDG) based on meta-learning has emerged as
+an effective technique for solving the domain-shift problem. However, the
+inadequate match of data distributions between source and augmented domains and
+the difficult separation of domain-invariant features from domain-related
+features make it hard for SDG models to achieve strong generalization.
+Therefore, a novel meta-learning method based on domain enhancement and feature
+alignment (MetaDefa) is proposed to improve the model generalization
+performance. First, background substitution and visual corruption techniques
+are used to generate diverse and effective augmented domains. Then, a
+multi-channel feature alignment module based on class activation maps and
+class-agnostic activation maps is designed to effectively extract adequate
+transferability knowledge. In this module, domain-invariant features can be
+fully explored by focusing on similar target regions between the source and
+augmented domain feature spaces and suppressing the feature representation of
+non-similar target regions. Extensive experiments on two publicly available
+datasets show that MetaDefa has significant generalization performance
+advantages in unknown multiple target domains.
+
+
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ☆ Stability-Informed Initialization of Neural Ordinary Differential + Equations + + +
+ This paper addresses the training of Neural Ordinary Differential Equations +(neural ODEs), and in particular explores the interplay between numerical +integration techniques, stability regions, step size, and initialization +techniques. It is shown how the choice of integration technique implicitly +regularizes the learned model, and how the solver's corresponding stability +region affects training and prediction performance. From this analysis, a +stability-informed parameter initialization technique is introduced. The +effectiveness of the initialization method is displayed across several learning +benchmarks and industrial applications. + +
+
+
+
+
+ + ☆ FLASC: A Flare-Sensitive Clustering Algorithm: Extending HDBSCAN* for + Detecting Branches in Clusters KDD + + +
+ We present FLASC, an algorithm for flare-sensitive clustering. Our algorithm +builds upon HDBSCAN* -- which provides high-quality density-based clustering +performance -- through a post-processing step that differentiates branches +within the detected clusters' manifold, adding a type of pattern that can be +discovered. Two variants of the algorithm are presented, which trade +computational cost for noise robustness. We show that both variants scale +similarly to HDBSCAN* in terms of computational cost and provide stable outputs +using synthetic data sets, resulting in an efficient flare-sensitive clustering +algorithm. In addition, we demonstrate the algorithm's benefit in data +exploration over HDBSCAN* clustering on two real-world data sets. + +
+
+ comment: 20 pages, 11 figures, submitted to ACM TKDD +
+
+
+
+
+ + ☆ RO-LLaMA: Generalist LLM for Radiation Oncology via Noise Augmentation + and Consistency Regularization + + +
+ Recent advancements in Artificial Intelligence (AI) have profoundly
+influenced medical fields by providing tools to reduce clinical workloads.
+However, most AI models are constrained to execute uni-modal tasks, in stark
+contrast to the comprehensive approaches utilized by medical professionals. To
+address this, here we present RO-LLaMA, a versatile generalist large language
+model (LLM) tailored for the field of radiation oncology. This model seamlessly
+covers a wide range of the workflow of radiation oncologists, adept at various
+tasks such as clinical report summarization, radiation therapy plan suggestion,
+and plan-guided therapy target volume segmentation. In particular, to maximize
+the end-to-end performance, we further present a novel Consistency Embedding
+Fine-Tuning (CEFTune) technique, which boosts the LLM's robustness to errors
+introduced at intermediate steps while preserving its capability of handling
+clean inputs, and we extend this concept into an LLM-driven segmentation
+framework, Consistency Embedding Segmentation (CESEG). Experimental results
+on multi-centre cohort sets demonstrate our proposed RO-LLaMA's promising
+performance for diverse tasks with generalization capabilities.
+
+
+
+
+
+
+ + ☆ Nodal Hydraulic Head Estimation through Unscented Kalman Filter for + Data-driven Leak Localization in Water Networks + + +
+ In this paper, we present a nodal hydraulic head estimation methodology for +water distribution networks (WDN) based on an Unscented Kalman Filter (UKF) +scheme with application to leak localization. The UKF refines an initial +estimation of the hydraulic state by considering the prediction model, as well +as available pressure and demand measurements. To this end, it provides +customized prediction and data assimilation steps. Additionally, the method is +enhanced by dynamically updating the prediction function weight matrices. +Performance testing on the Modena benchmark under realistic conditions +demonstrates the method's effectiveness in enhancing state estimation and +data-driven leak localization. + +
+
+ comment: This work has been submitted to IFAC for possible publication. It has + 6 pages and 3 figures +
+
+
+
+
+ + ☆ A precise symbolic emulator of the linear matter power spectrum + + +
+ Computing the matter power spectrum, $P(k)$, as a function of cosmological +parameters can be prohibitively slow in cosmological analyses, hence emulating +this calculation is desirable. Previous analytic approximations are +insufficiently accurate for modern applications, so black-box, uninterpretable +emulators are often used. We utilise an efficient genetic programming based +symbolic regression framework to explore the space of potential mathematical +expressions which can approximate the power spectrum and $\sigma_8$. We learn +the ratio between an existing low-accuracy fitting function for $P(k)$ and that +obtained by solving the Boltzmann equations and thus still incorporate the +physics which motivated this earlier approximation. We obtain an analytic +approximation to the linear power spectrum with a root mean squared fractional +error of 0.2% between $k = 9\times10^{-3} - 9 \, h{\rm \, Mpc^{-1}}$ and across +a wide range of cosmological parameters, and we provide physical +interpretations for various terms in the expression. We also provide a simple +analytic approximation for $\sigma_8$ with a similar accuracy, with a root mean +squared fractional error of just 0.4% when evaluated across the same range of +cosmologies. This function is easily invertible to obtain $A_{\rm s}$ as a +function of $\sigma_8$ and the other cosmological parameters, if preferred. It +is possible to obtain symbolic approximations to a seemingly complex function +at a precision required for current and future cosmological analyses without +resorting to deep-learning techniques, thus avoiding their black-box nature and +large number of parameters. Our emulator will be usable long after the codes on +which numerical approximations are built become outdated. + +
+
+ comment: 9 pages, 5 figures. Submitted to A&A +
+
+
+
+
+ + ☆ Multi-Agent Reinforcement Learning for Power Control in Wireless + Networks via Adaptive Graphs + + +
+ The ever-increasing demand for high-quality and heterogeneous wireless +communication services has driven extensive research on dynamic optimization +strategies in wireless networks. Among several possible approaches, multi-agent +deep reinforcement learning (MADRL) has emerged as a promising method to +address a wide range of complex optimization problems like power control. +However, the seamless application of MADRL to a variety of network optimization +problems faces several challenges related to convergence. In this paper, we +present the use of graphs as communication-inducing structures among +distributed agents as an effective means to mitigate these challenges. +Specifically, we harness graph neural networks (GNNs) as neural architectures +for policy parameterization to introduce a relational inductive bias in the +collective decision-making process. Most importantly, we focus on modeling the +dynamic interactions among sets of neighboring agents through the introduction +of innovative methods for defining a graph-induced framework for integrated +communication and learning. Finally, the superior generalization capabilities +of the proposed methodology to larger networks and to networks with different +user categories is verified through simulations. + +
+
+ comment: 6 pages, 4 figures. This work has been submitted to the IEEE for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+ + ☆ A systematic study comparing hyperparameter optimization engines on + tabular data + + +
+ We run an independent comparison of all hyperparameter optimization
+(hyperopt) engines available in the Ray Tune library. We introduce two ways to
+normalize and aggregate statistics across data sets and models, one rank-based,
+and another one sandwiching the score between the random search score and the
+full grid search score. This allows us i) to rank the hyperopt engines, ii) to
+make generalized and statistically significant statements on how much they
+improve over random search, and iii) to make recommendations on which engine
+should be used to hyperopt a given learning algorithm. We find that most
+engines beat random search, but that only three of them (HEBO, AX, and
+BlendSearch) clearly stand out. We also find that some engines seem to
+specialize in hyperopting certain learning algorithms, which makes it tricky to
+use hyperopt in comparison studies, since the choice of the hyperopt technique
+may favor some of the models in the comparison.
+
+
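+ Editor's note: the sandwiching normalization described above can be
+illustrated with a short helper (assuming higher scores are better; the exact
+aggregation used in the paper may differ).
+
+```python
+def sandwich_normalize(score, random_search_score, grid_search_score):
+    """Maps an engine's score so that 0 means no better than random search and
+    1 means matching the full grid search. Illustrative sketch only."""
+    span = grid_search_score - random_search_score
+    if span == 0:
+        return 0.0
+    return (score - random_search_score) / span
+```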
+
+
+
+
+ + ☆ Cell Maps Representation For Lung Adenocarcinoma Growth Patterns + Classification In Whole Slide Images + + +
+ Lung adenocarcinoma is a morphologically heterogeneous disease, characterized +by five primary histologic growth patterns. The quantity of these patterns can +be related to tumor behavior and has a significant impact on patient prognosis. +In this work, we propose a novel machine learning pipeline capable of +classifying tissue tiles into one of the five patterns or as non-tumor, with an +Area Under the Receiver Operating Characteristic Curve (AUCROC) score of 0.97. +Our model's strength lies in its comprehensive consideration of cellular +spatial patterns, where it first generates cell maps from Hematoxylin and Eosin +(H&E) whole slide images (WSIs), which are then fed into a convolutional neural +network classification model. Exploiting these cell maps provides the model +with robust generalizability to new data, achieving approximately 30% higher +accuracy on unseen test-sets compared to current state of the art approaches. +The insights derived from our model can be used to predict prognosis, enhancing +patient outcomes. + +
+
+
+
+
+ + ☆ Utilizing Explainability Techniques for Reinforcement Learning Model + Assurance NeurIPS 2023 + + +
+ Explainable Reinforcement Learning (XRL) can provide transparency into the +decision-making process of a Deep Reinforcement Learning (DRL) model and +increase user trust and adoption in real-world use cases. By utilizing XRL +techniques, researchers can identify potential vulnerabilities within a trained +DRL model prior to deployment, therefore limiting the potential for mission +failure or mistakes by the system. This paper introduces the ARLIN (Assured RL +Model Interrogation) Toolkit, an open-source Python library that identifies +potential vulnerabilities and critical points within trained DRL models through +detailed, human-interpretable explainability outputs. To illustrate ARLIN's +effectiveness, we provide explainability visualizations and vulnerability +analysis for a publicly available DRL model. The open-source code repository is +available for download at https://github.com/mitre/arlin. + +
+
+ comment: 9 pages, 8 figures including appendices (A, B, C). Accepted as a + poster presentation in the demo track at the "XAI in Action: Past, Present, + and Future Applications" workshop at NeurIPS 2023. MITRE Public Release Case + Number 23-3095 +
+
+
+
+
+ + ☆ Temporal Action Localization for Inertial-based Human Activity + Recognition + + +
+ A persistent trend in Deep Learning has been the applicability of machine
+learning concepts to areas other than those they were originally introduced
+for. As of today, state-of-the-art activity recognition from wearable sensors
+relies on classifiers being trained on fixed windows of data. Contrarily,
+video-based Human Activity Recognition has followed a segment-based prediction
+approach, localizing activity occurrences from start to end. This paper is the
+first to systematically demonstrate the applicability of state-of-the-art TAL
+models for wearable Human Activity Recognition (HAR) using raw inertial data as
+input. Our results show that state-of-the-art TAL models are able to outperform
+popular inertial models on 4 out of 6 wearable activity recognition benchmark
+datasets, with improvements of as much as 25% in F1-score. Introducing the TAL
+community's most popular metric to inertial-based HAR, namely mean Average
+Precision, our analysis shows that TAL models are able to produce more coherent
+segments along with an overall higher NULL-class accuracy across all datasets.
+As the first to provide such an analysis, we argue that TAL offers an
+interesting new perspective on inertial-based HAR, with yet-to-be-explored
+design choices and training concepts that could be of significant value for
+the inertial-based HAR community.
+
+
+
+ comment: 20 pages, 7 figures, 2 tables +
+
+
+
+
+ + ☆ Scale-Dropout: Estimating Uncertainty in Deep Neural Networks Using + Stochastic Scale + + +
+ Uncertainty estimation in Neural Networks (NNs) is vital in improving +reliability and confidence in predictions, particularly in safety-critical +applications. Bayesian Neural Networks (BayNNs) with Dropout as an +approximation offer a systematic approach to quantifying uncertainty, but they +inherently suffer from high hardware overhead in terms of power, memory, and +computation. Thus, the applicability of BayNNs to edge devices with limited +resources or to high-performance applications is challenging. Some of the +inherent costs of BayNNs can be reduced by accelerating them in hardware on a +Computation-In-Memory (CIM) architecture with spintronic memories and +binarizing their parameters. However, numerous stochastic units are required to +implement conventional dropout-based BayNN. In this paper, we propose the Scale +Dropout, a novel regularization technique for Binary Neural Networks (BNNs), +and Monte Carlo-Scale Dropout (MC-Scale Dropout)-based BayNNs for efficient +uncertainty estimation. Our approach requires only one stochastic unit for the +entire model, irrespective of the model size, leading to a highly scalable +Bayesian NN. Furthermore, we introduce a novel Spintronic memory-based CIM +architecture for the proposed BayNN that achieves more than $100\times$ energy +savings compared to the state-of-the-art. We validated our method to show up to +a $1\%$ improvement in predictive performance and superior uncertainty +estimates compared to related works. + +
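+ Editor's sketch, purely illustrative of the idea of a single stochastic unit
+shared by the entire model; the actual Scale Dropout formulation, scale values
+and binarization details in the paper may differ.
+
+```python
+import torch
+import torch.nn as nn
+
+class GlobalScaleDropout(nn.Module):
+    """One Bernoulli draw per forward pass rescales the whole activation tensor,
+    instead of one stochastic unit per neuron as in conventional dropout."""
+    def __init__(self, p=0.5, scale=2.0):
+        super().__init__()
+        self.p, self.scale = p, scale
+
+    def forward(self, x):
+        draw = torch.bernoulli(torch.tensor(self.p, device=x.device))
+        return x * self.scale if draw.item() == 1 else x
+
+def mc_predict(model, x, samples=20):
+    """Monte Carlo uncertainty estimate: repeat the stochastic forward pass."""
+    preds = torch.stack([model(x) for _ in range(samples)])
+    return preds.mean(0), preds.var(0)
+```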
+
+
+
+
+ + ☆ Exploring Artificial Intelligence Methods for Energy Prediction in + Healthcare Facilities: An In-Depth Extended Systematic Review + + +
+ Hospitals, due to their complexity and unique requirements, play a pivotal +role in global energy consumption patterns. This study conducted a +comprehensive literature review, utilizing the PRISMA framework, of articles +that employed machine learning and artificial intelligence techniques for +predicting energy consumption in hospital buildings. Of the 1884 publications +identified, 17 were found to address this specific domain and have been +thoroughly reviewed to establish the state-of-the-art and identify gaps where +future research is needed. This review revealed a diverse range of data inputs +influencing energy prediction, with occupancy and meteorological data emerging +as significant predictors. However, many studies failed to delve deep into the +implications of their data choices, and gaps were evident regarding the +understanding of time dynamics, operational status, and preprocessing methods. +Machine learning, especially deep learning models like ANNs, have shown +potential in this domain, yet they come with challenges, including +interpretability and computational demands. The findings underscore the immense +potential of AI in optimizing hospital energy consumption but also highlight +the need for more comprehensive and granular research. Key areas for future +research include the optimization of ANN approaches, new optimization and data +integration techniques, the integration of real-time data into Intelligent +Energy Management Systems, and increasing focus on long-term energy +forecasting. + +
+
+ comment: 38 pages, 1 figure, 3 tables, systematic literature review +
+
+
+
+
+ + ☆ Rethinking Privacy in Machine Learning Pipelines from an Information + Flow Control Perspective + + +
+ Modern machine learning systems use models trained on ever-growing corpora. +Typically, metadata such as ownership, access control, or licensing information +is ignored during training. Instead, to mitigate privacy risks, we rely on +generic techniques such as dataset sanitization and differentially private +model training, with inherent privacy/utility trade-offs that hurt model +performance. Moreover, these techniques have limitations in scenarios where +sensitive information is shared across multiple participants and fine-grained +access control is required. By ignoring metadata, we therefore miss an +opportunity to better address security, privacy, and confidentiality +challenges. In this paper, we take an information flow control perspective to +describe machine learning systems, which allows us to leverage metadata such as +access control policies and define clear-cut privacy and confidentiality +guarantees with interpretable information flows. Under this perspective, we +contrast two different approaches to achieve user-level non-interference: 1) +fine-tuning per-user models, and 2) retrieval augmented models that access +user-specific datasets at inference time. We compare these two approaches to a +trivially non-interfering zero-shot baseline using a public model and to a +baseline that fine-tunes this model on the whole corpus. We evaluate trained +models on two datasets of scientific articles and demonstrate that retrieval +augmented architectures deliver the best utility, scalability, and flexibility +while satisfying strict non-interference guarantees. + +
+
+
+
+
+ + ☆ Relationship between Model Compression and Adversarial Robustness: A + Review of Current Evidence SC + + +
+ Increasing the model capacity is a known approach to enhance the adversarial +robustness of deep learning networks. On the other hand, various model +compression techniques, including pruning and quantization, can reduce the size +of the network while preserving its accuracy. Several recent studies have +addressed the relationship between model compression and adversarial +robustness, while some experiments have reported contradictory results. This +work summarizes available evidence and discusses possible explanations for the +observed effects. + +
+
+ comment: Accepted for publication at SSCI 2023 +
+
+
+
+
+ + ☆ Increasing Coverage and Precision of Textual Information in Multilingual + Knowledge Graphs EMNLP 2023 + + +
+ Recent work in Natural Language Processing and Computer Vision has been using +textual information -- e.g., entity names and descriptions -- available in +knowledge graphs to ground neural models to high-quality structured data. +However, when it comes to non-English languages, the quantity and quality of +textual information are comparatively scarce. To address this issue, we +introduce the novel task of automatic Knowledge Graph Enhancement (KGE) and +perform a thorough investigation on bridging the gap in both the quantity and +quality of textual information between English and non-English languages. More +specifically, we: i) bring to light the problem of increasing multilingual +coverage and precision of entity names and descriptions in Wikidata; ii) +demonstrate that state-of-the-art methods, namely, Machine Translation (MT), +Web Search (WS), and Large Language Models (LLMs), struggle with this task; +iii) present M-NTA, a novel unsupervised approach that combines MT, WS, and +LLMs to generate high-quality textual information; and, iv) study the impact of +increasing multilingual coverage and precision of non-English textual +information in Entity Linking, Knowledge Graph Completion, and Question +Answering. As part of our effort towards better multilingual knowledge graphs, +we also introduce WikiKGE-10, the first human-curated benchmark to evaluate KGE +approaches in 10 languages across 7 language families. + +
+
+ comment: Camera ready for EMNLP 2023 +
+
+
+
+
+ + ☆ Attend Who is Weak: Enhancing Graph Condensation via Cross-Free + Adversarial Training + + +
+ In this paper, we study the graph condensation problem by
+compressing the large, complex graph into a concise, synthetic representation
+that preserves the most essential and discriminative information of structure
+and features. We propose the novel concept of a Shock Absorber (a type of
+perturbation) that enhances the robustness and stability of the original graphs
+against changes in an adversarial training fashion. Concretely, (I) we forcibly
+match the gradients between pre-selected graph neural networks (GNNs) trained
+on a synthetic, simplified graph and the original training graph at regularly
+spaced intervals. (II) Before each update of the synthetic graph, a Shock
+Absorber serves as a gradient attacker to maximize the distance between the
+synthetic dataset and the original graph by selectively perturbing the parts
+that are underrepresented or insufficiently informative. We iteratively repeat
+the above two processes (I and II) in an adversarial training fashion to
+maintain the highly-informative context without losing correlation with the
+original dataset. More importantly, our shock absorber and the synthesized
+graph share the backward pass in parallel in a free training manner. Compared
+to the original adversarial training, it introduces almost no additional time
+overhead.
+ We validate our framework across 8 datasets (3 graph and 5 node
+classification datasets) and achieve prominent results: for example, on Cora,
+Citeseer and Ogbn-Arxiv, we gain improvements of nearly 1.13% to 5.03% compared
+with SOTA models. Moreover, our algorithm adds only about 0.2% to 2.2%
+additional time overhead on Flickr, Citeseer and Ogbn-Arxiv. Compared to
+general adversarial training, our approach improves time efficiency by nearly
+4-fold.
+
+
+
+
+
+
+ + ☆ Learning Multi-Frequency Partial Correlation Graphs + + +
+ Despite the large research effort devoted to learning dependencies between +time series, the state of the art still faces a major limitation: existing +methods learn partial correlations but fail to discriminate across distinct +frequency bands. Motivated by many applications in which this differentiation +is pivotal, we overcome this limitation by learning a block-sparse, +frequency-dependent, partial correlation graph, in which layers correspond to +different frequency bands, and partial correlations can occur over just a few +layers. To this aim, we formulate and solve two nonconvex learning problems: +the first has a closed-form solution and is suitable when there is prior +knowledge about the number of partial correlations; the second hinges on an +iterative solution based on successive convex approximation, and is effective +for the general case where no prior knowledge is available. Numerical results +on synthetic data show that the proposed methods outperform the current state +of the art. Finally, the analysis of financial time series confirms that +partial correlations exist only within a few frequency bands, underscoring how +our methods enable the gaining of valuable insights that would be undetected +without discriminating along the frequency domain. + +
+
+
+
+
+ + ☆ Adinkra Symbol Recognition using Classical Machine Learning and Deep + Learning + + +
+ Artificial intelligence (AI) has emerged as a transformative influence, +engendering paradigm shifts in global societies, spanning academia and +industry. However, in light of these rapid advances, addressing the +underrepresentation of black communities and African countries in AI is +crucial. Boosting enthusiasm for AI can be effectively accomplished by +showcasing straightforward applications around tasks like identifying and +categorizing traditional symbols, such as Adinkra symbols, or familiar objects +within the community. In this research endeavor, we dived into classical +machine learning and harnessed the power of deep learning models to tackle the +intricate task of classifying and recognizing Adinkra symbols. The idea led to +a newly constructed ADINKRA dataset comprising 174,338 images meticulously +organized into 62 distinct classes, each representing a singular and emblematic +symbol. We constructed a CNN model for classification and recognition using six +convolutional layers, three fully connected (FC) layers, and optional dropout +regularization. The model is a simpler and smaller version of VGG, with fewer +layers, smaller channel sizes, and a fixed kernel size. Additionally, we tap +into the transfer learning capabilities provided by pre-trained models like VGG +and ResNet. These models assist us in both classifying images and extracting +features that can be used with classical machine learning models. We assess the +model's performance by measuring its accuracy and convergence rate and +visualizing the areas that significantly influence its predictions. These +evaluations serve as a foundational benchmark for future assessments of the +ADINKRA dataset. We hope this application exemplar inspires ideas on the +various uses of AI in organizing our traditional and modern lives. + +
+
+ comment: 15 pages, 6 figures, 5 tables +
+
+
+
+
+ + ☆ GLIME: General, Stable and Local LIME Explanation NeurIPS 2023 + + +
+ As black-box machine learning models grow in complexity and find applications
+in high-stakes scenarios, it is imperative to provide explanations for their
+predictions. Although Local Interpretable Model-agnostic Explanations (LIME)
+[22] is a widely adopted method for understanding model behaviors, it is
+unstable with respect to random seeds [35,24,3] and exhibits low local fidelity
+(i.e., how well the explanation approximates the model's local behaviors)
+[21,16]. Our study shows that this instability problem stems from small sample
+weights, leading to the dominance of regularization and slow convergence.
+Additionally, LIME's sampling neighborhood is non-local and biased towards the
+reference, resulting in poor local fidelity and sensitivity to reference
+choice. To tackle these challenges, we introduce GLIME, an enhanced framework
+extending LIME and unifying several prior methods. Within the GLIME framework,
+we derive an equivalent formulation of LIME that achieves significantly faster
+convergence and improved stability. By employing a local and unbiased sampling
+distribution, GLIME generates explanations with higher local fidelity compared
+to LIME. GLIME explanations are independent of reference choice. Moreover,
+GLIME offers users the flexibility to choose a sampling distribution based on
+their specific scenarios.
+
+
+
+ comment: Accepted by NeurIPS 2023 as a Spotlight paper +
+
+
+
+
+ + ☆ Variational Autoencoders for Feature Exploration and Malignancy + Prediction of Lung Lesions BMVC 2023 + + +
+ Lung cancer is responsible for 21% of cancer deaths in the UK and five-year
+survival rates are heavily influenced by the stage at which the cancer is
+identified. Recent studies have demonstrated the capability of AI methods for
+accurate and early diagnosis of lung cancer from routine scans. However, this
+evidence has not translated into clinical practice, with one barrier being a
+lack of interpretable models. This study investigates the application of
+Variational Autoencoders (VAEs), a type of generative AI model, to lung cancer
+lesions. Proposed models were trained on lesions extracted from 3D CT scans in
+the LIDC-IDRI public dataset. Latent vector representations of 2D slices
+produced by the VAEs were explored through clustering to justify their quality
+and used in an MLP classifier model for lung cancer diagnosis; the best model
+achieved state-of-the-art metrics of AUC 0.98 and 93.1% accuracy. Cluster
+analysis shows the VAE latent space separates the dataset of malignant and
+benign lesions based on meaningful feature components including tumour size,
+shape, patient and malignancy class. We also include a comparative analysis of
+the standard Gaussian VAE (GVAE) and the more recent Dirichlet VAE (DirVAE),
+which replaces the prior with a Dirichlet distribution to encourage a more
+explainable latent space with disentangled feature representation. Finally, we
+demonstrate the potential for latent space traversals corresponding to
+clinically meaningful feature changes.
+
+
+
+ comment: 10 pages (main paper), 5 pages (references), 5 figures, 2 tables, + work accepted for BMVC 2023 +
+
+
+
+
+ + ☆ Tabular Two-Dimensional Correlation Analysis for Multifaceted + Characterization Data + + +
+ We propose tabular two-dimensional correlation analysis for extracting +features from multifaceted characterization data, essential for understanding +material properties. This method visualizes similarities and phase lags in +structural parameter changes through heatmaps, combining hierarchical +clustering and asynchronous correlations. We applied the proposed method to +datasets of carbon nanotube (CNTs) films annealed at various temperatures and +revealed the complexity of their hierarchical structures, which include +elements like voids, bundles, and amorphous carbon. Our analysis addresses the +challenge of attempting to understand the sequence of structural changes, +especially in multifaceted characterization data where 11 structural parameters +derived from 8 characterization methods interact with complex behavior. The +results show how phase lags (asynchronous changes from stimuli) and parameter +similarities can illuminate the sequence of structural changes in materials, +providing insights into phenomena like the removal of amorphous carbon and +graphitization in annealed CNTs. This approach is beneficial even with limited +data and holds promise for a wide range of material analyses, demonstrating its +potential in elucidating complex material behaviors and properties. + +
+
+ comment: 15 pages, 4 figures +
+
+
+
+
+ + ☆ Peptide Binding Classification on Quantum Computers + + +
+ We conduct an extensive study on using near-term quantum computers for a task +in the domain of computational biology. By constructing quantum models based on +parameterised quantum circuits we perform sequence classification on a task +relevant to the design of therapeutic proteins, and find competitive +performance with classical baselines of similar scale. To study the effect of +noise, we run some of the best-performing quantum models with favourable +resource requirements on emulators of state-of-the-art noisy quantum +processors. We then apply error mitigation methods to improve the signal. We +further execute these quantum models on the Quantinuum H1-1 trapped-ion quantum +processor and observe very close agreement with noiseless exact simulation. +Finally, we perform feature attribution methods and find that the quantum +models indeed identify sensible relationships, at least as well as the +classical baselines. This work constitutes the first proof-of-concept +application of near-term quantum computing to a task critical to the design of +therapeutic proteins, opening the route toward larger-scale applications in +this and related fields, in line with the hardware development roadmaps of +near-term quantum technologies. + +
+
+
+
+
+ + ☆ Automated discovery of trade-off between utility, privacy and fairness + in machine learning models ECML 2023 + + +
+ Machine learning models are deployed as a central component in decision
+making and policy operations with direct impact on individuals' lives. In order
+to act ethically and comply with government regulations, these models need to
+make fair decisions and protect the users' privacy. However, such requirements
+can come with a decrease in models' performance compared to their potentially
+biased, privacy-leaking counterparts. Thus, the trade-off between fairness,
+privacy and performance of ML models emerges, and practitioners need a way of
+quantifying this trade-off to enable deployment decisions. In this work we
+interpret this trade-off as a multi-objective optimization problem, and propose
+PFairDP, a pipeline that uses Bayesian optimization for discovery of
+Pareto-optimal points between fairness, privacy and utility of ML models. We
+show how PFairDP can be used to replicate known results that were achieved
+through a manual constraint-setting process. We further demonstrate the
+effectiveness of PFairDP with experiments on multiple models and datasets.
+
+
+
+ comment: 3rd Workshop on Bias and Fairness in AI (BIAS), ECML 2023 +
+
+
+
+
+ + ☆ The Battleship Approach to the Low Resource Entity Matching Problem + + +
+ Entity matching, a core data integration problem, is the task of deciding
+whether two data tuples refer to the same real-world entity. Recent advances in
+deep learning methods, using pre-trained language models, were proposed for
+resolving entity matching. Although demonstrating unprecedented results, these
+solutions suffer from a major drawback as they require large amounts of labeled
+data for training, and, as such, are inadequate to be applied to low resource
+entity matching problems. To overcome the challenge of obtaining sufficient
+labeled data, we offer a new active learning approach, focusing on a selection
+mechanism that exploits unique properties of entity matching. We argue that a
+distributed representation of a tuple pair indicates its informativeness when
+considered among other pairs. This observation is subsequently used in our
+approach, which iteratively utilizes space-aware considerations. Bringing it
+all together, we treat the low resource entity matching problem as a Battleship
+game, hunting indicative samples, focusing on positive ones, through awareness
+of the latent space along with careful planning of next sampling iterations. An
+extensive experimental analysis shows that the proposed algorithm outperforms
+state-of-the-art active learning solutions to low resource entity matching,
+and, while using fewer samples, can be as successful as state-of-the-art fully
+trained known algorithms.
+
+
+
+
+
+
+ + ☆ Information theoretic study of the neural geometry induced by category + learning NeurIPS 2023 + + +
+ Categorization is an important topic both for biological and artificial +neural networks. Here, we take an information theoretic approach to assess the +efficiency of the representations induced by category learning. We show that +one can decompose the relevant Bayesian cost into two components, one for the +coding part and one for the decoding part. Minimizing the coding cost implies +maximizing the mutual information between the set of categories and the neural +activities. We analytically show that this mutual information can be written as +the sum of two terms that can be interpreted as (i) finding an appropriate +representation space, and, (ii) building a representation with the appropriate +metrics, based on the neural Fisher information on this space. One main +consequence is that category learning induces an expansion of neural space near +decision boundaries. Finally, we provide numerical illustrations that show how +Fisher information of the coding neural population aligns with the boundaries +between categories. + +
+
+ comment: 7 pages, 2 figures, Accepted (Oral) to InfoCog@NeurIPS 2023 +
+
+
+
+
+ + ☆ Accelerating Hierarchical Associative Memory: A Deep Equilibrium + Approach NeurIPS + + +
+ Hierarchical Associative Memory models have recently been proposed as a +versatile extension of continuous Hopfield networks. In order to facilitate +future research on such models, especially at scale, we focus on increasing +their simulation efficiency on digital hardware. In particular, we propose two +strategies to speed up memory retrieval in these models, which corresponds to +their use at inference, but is equally important during training. First, we +show how they can be cast as Deep Equilibrium Models, which allows using faster +and more stable solvers. Second, inspired by earlier work, we show that +alternating optimization of the even and odd layers accelerates memory +retrieval by a factor close to two. Combined, these two techniques allow for a +much faster energy minimization, as shown in our proof-of-concept experimental +results. The code is available at https://github.com/cgoemaere/hamdeq + +
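+ Editor's illustration of the alternating even/odd update schedule mentioned
+above; the helper `layer_update` is hypothetical, standing in for the
+energy-minimizing update of one layer with its neighbouring layers held fixed.
+
+```python
+def alternating_retrieval(states, layer_update, steps=50):
+    """Sketch: update even-indexed layers with odd neighbours fixed, then vice versa.
+    `states` is a list of per-layer activations; `layer_update(states, i)` returns
+    the new state of layer i given its (fixed) neighbours."""
+    for _ in range(steps):
+        for i in range(0, len(states), 2):   # even layers
+            states[i] = layer_update(states, i)
+        for i in range(1, len(states), 2):   # odd layers
+            states[i] = layer_update(states, i)
+    return states
+```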
+
+ comment: Accepted at the "Associative Memory & Hopfield Networks'' workshop at + NeurIPS, 2023 +
+
+
+
+
+ + ☆ Regularization by Texts for Latent Diffusion Inverse Solvers + + +
+ The recent advent of diffusion models has led to significant progress in
+solving inverse problems, leveraging these models as effective generative
+priors. Nonetheless, challenges related to the ill-posed nature of such
+problems remain, often due to inherent ambiguities in measurements. Drawing
+inspiration from the human ability to resolve visual ambiguities through
+perceptual biases, here we introduce a novel latent diffusion inverse solver by
+incorporating regularization by texts (TReg). Specifically, TReg applies the
+textual description of the preconception of the solution during the reverse
+sampling phase, and this description is dynamically reinforced through
+null-text optimization for adaptive negation. Our comprehensive experimental
+results demonstrate that TReg successfully mitigates ambiguity in latent
+diffusion inverse solvers, enhancing their effectiveness and accuracy.
+
+
+
+
+
+
+ + ☆ Universal Event Detection in Time Series + + +
+ In our previously published work, we introduced a supervised deep learning +method for event detection in multivariate time series data, employing +regression instead of binary classification. This simplification avoids the +need for point-wise labels throughout the entire dataset, relying solely on +ground truth events defined as time points or intervals. In this paper, we +establish mathematically that our method is universal, and capable of detecting +any type of event with arbitrary precision under mild continuity assumptions on +the time series. These events may encompass change points, frauds, anomalies, +physical occurrences, and more. We substantiate our theoretical results using +the universal approximation theorem for feed-forward neural networks (FFN). +Additionally, we provide empirical validations that confirm our claims, +demonstrating that our method, with a limited number of parameters, outperforms +other deep learning approaches, particularly for rare events and imbalanced +datasets from different domains. + +
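+ Editor's sketch of the general idea of replacing point-wise labels with a
+regression target built only from ground-truth event intervals; the concrete
+target used in the paper may be defined differently.
+
+```python
+import numpy as np
+
+def soft_event_target(num_steps, event_intervals, width_scale=1.0):
+    """Builds one continuous value per time step: a smooth bump centred on each
+    labelled event interval, near zero elsewhere. Illustrative only."""
+    t = np.arange(num_steps, dtype=float)
+    target = np.zeros(num_steps)
+    for start, end in event_intervals:
+        centre = (start + end) / 2.0
+        width = max((end - start) / 2.0, 1.0) * width_scale
+        target = np.maximum(target, np.exp(-0.5 * ((t - centre) / width) ** 2))
+    return target
+
+# Example: two labelled events in a series of 100 steps
+y = soft_event_target(100, [(10, 14), (60, 75)])
+```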
+
+ comment: To be submitted to IEEE Transactions on Neural Networks and Learning + Systems +
+
+
+
+
+ + ☆ RoboGPT: an intelligent agent of making embodied long-term decisions for + daily instruction tasks + + +
+ Robotic agents must master common sense and long-term sequential decisions to
+solve daily tasks through natural language instruction. The developments in
+Large Language Models (LLMs) in natural language processing have inspired
+efforts to use LLMs in complex robot planning. Despite LLMs' great
+generalization and comprehension of instruction tasks, LLM-generated task
+plans sometimes lack feasibility and correctness. To address the problem, we
+propose a RoboGPT agent (our code and dataset will be released soon) for making
+embodied long-term decisions for daily tasks, with two modules: 1) LLM-based
+planning with re-plan to break the task into multiple sub-goals; 2) RoboSkill,
+individually designed for sub-goals to learn better navigation and manipulation
+skills. The LLM-based planning is enhanced with a new robotic dataset and
+re-plan, called RoboGPT. The new robotic dataset of 67k daily instruction tasks
+is gathered for fine-tuning the Llama model and obtaining RoboGPT. The RoboGPT
+planner, with strong generalization, can plan hundreds of daily instruction
+tasks. Additionally, a low-computational Re-Plan module is designed to allow
+plans to flexibly adapt to the environment, thereby addressing the nomenclature
+diversity challenge. The proposed RoboGPT agent outperforms SOTA methods on the
+ALFRED daily tasks. Moreover, the RoboGPT planner exceeds SOTA LLM-based
+planners like ChatGPT in task-planning rationality for hundreds of unseen daily
+tasks, and even tasks from other domains, while keeping the large model's
+original broad application and generality.
+
+
+
+
+
+
+ + ☆ Reinforcement Learning from Diffusion Feedback: Q* for Image Search + + +
+ Large vision-language models are steadily gaining personalization +capabilities at the cost of fine-tuning or data augmentation. We present two +models for image generation using model-agnostic learning that align semantic +priors with generative capabilities. RLDF, or Reinforcement Learning from +Diffusion Feedback, is a singular approach for visual imitation through +prior-preserving reward function guidance. This employs Q-learning (with +standard Q*) for generation and follows a semantic-rewarded trajectory for +image search through finite encoding-tailored actions. The second proposed +method, noisy diffusion gradient, is optimization driven. At the root of both +methods is a special CFG encoding that we propose for continual semantic +guidance. Using only a single input image and no text input, RLDF generates +high-quality images over varied domains including retail, sports and +agriculture showcasing class-consistency and strong visual diversity. Project +website is available at https://infernolia.github.io/RLDF. + +
+
+
+
+
+ + ☆ Bandits Meet Mechanism Design to Combat Clickbait in Online + Recommendation + + +
+ We study a strategic variant of the multi-armed bandit problem, which we coin +the strategic click-bandit. This model is motivated by applications in online +recommendation where the choice of recommended items depends on both the +click-through rates and the post-click rewards. Like in classical bandits, +rewards follow a fixed unknown distribution. However, we assume that the +click-rate of each arm is chosen strategically by the arm (e.g., a host on +Airbnb) in order to maximize the number of times it gets clicked. The algorithm +designer does not know the post-click rewards nor the arms' actions (i.e., +strategically chosen click-rates) in advance, and must learn both values over +time. To solve this problem, we design an incentive-aware learning algorithm, +UCB-S, which achieves two goals simultaneously: (a) incentivizing desirable arm +behavior under uncertainty; (b) minimizing regret by learning unknown +parameters. We characterize all approximate Nash equilibria among arms under +UCB-S and show a $\tilde{\mathcal{O}} (\sqrt{KT})$ regret bound uniformly in +every equilibrium. We also show that incentive-unaware algorithms generally +fail to achieve low regret in the strategic click-bandit. Finally, we support +our theoretical results by simulations of strategic arm behavior which confirm +the effectiveness and robustness of our proposed incentive design. + +
+
+
+
+
+ + ☆ Injecting linguistic knowledge into BERT for Dialogue State Tracking + + +
+ Dialogue State Tracking (DST) models often employ intricate neural network +architectures, necessitating substantial training data, and their inference +processes lack transparency. This paper proposes a method that extracts +linguistic knowledge via an unsupervised framework and subsequently utilizes +this knowledge to augment BERT's performance and interpretability in DST tasks. +The knowledge extraction procedure is computationally economical and does not +necessitate annotations or additional training data. The injection of the +extracted knowledge necessitates the addition of only simple neural modules. We +employ the Convex Polytopic Model (CPM) as a feature extraction tool for DST +tasks and illustrate that the acquired features correlate with the syntactic +and semantic patterns in the dialogues. This correlation facilitates a +comprehensive understanding of the linguistic features influencing the DST +model's decision-making process. We benchmark this framework on various DST +tasks and observe a notable improvement in accuracy. + +
+
+
+
+
+ + ☆ VeryFL: A Verify Federated Learning Framework Embedded with Blockchain + + +
+ Blockchain-empowered federated learning (FL) has provoked extensive research recently. Various blockchain-based federated learning algorithms, architectures, and mechanisms have been designed to solve issues like single points of failure and data falsification brought by the centralized FL paradigm. Moreover, it is easier to allocate incentives to nodes with the help of the blockchain. Various centralized federated learning frameworks like FedML have emerged in the community to help boost research on FL. However, a decentralized blockchain-based federated learning framework is still missing, which makes it inconvenient for researchers to reproduce or verify algorithm performance on blockchain. Motivated by these issues, we have designed and developed a blockchain-based federated learning framework that embeds the Ethereum network. This report presents the overall structure of the framework, which proposes a code-practice paradigm for combining FL with blockchain while remaining compatible with normal FL training tasks. In addition to implementing several blockchain federated learning algorithms on smart contracts to help execute FL training, we also propose a model ownership authentication architecture based on blockchain and model watermarking to protect the intellectual property rights of models. These blockchain mechanisms provide underlying support for verifiable training, aggregation, and incentive distribution in federated learning, and we therefore name this framework VeryFL (A Verify Federated Learning Framework Embedded with Blockchain). The source code is available at https://github.com/GTMLLab/VeryFL. + +
+
+
+
+
+ + ☆ Bayesian Approach to Linear Bayesian Networks + + +
+ This study proposes the first Bayesian approach for learning high-dimensional linear Bayesian networks. The proposed approach iteratively estimates each element of the topological ordering, from the last node backward, together with its parent set, using the inverse of a partial covariance matrix. The method successfully recovers the underlying structure when Bayesian regularization with unequal shrinkage is applied to the inverse covariance matrix. Specifically, we show that sample sizes $n = \Omega( d_M^2 \log p)$ and $n = \Omega(d_M^2 p^{2/m})$ are sufficient for the proposed algorithm to learn linear Bayesian networks with sub-Gaussian and $4m$-th bounded-moment error distributions, respectively, where $p$ is the number of nodes and $d_M$ is the maximum degree of the moralized graph. The theoretical findings are supported by extensive simulation studies and real data analysis. Furthermore, the proposed method is demonstrated to outperform state-of-the-art frequentist approaches, such as the BHLSM, LISTEN, and TD algorithms, on synthetic data. + +
+
+
+
+
+ + ☆ A manometric feature descriptor with linear-SVM to distinguish + esophageal contraction vigor + + +
+ In clinical practice, if a patient presents with nonmechanical obstructive dysphagia, esophageal chest pain, and gastroesophageal reflux symptoms, the physician will usually assess esophageal dynamic function. High-resolution manometry (HRM) is a commonly used clinical technique for comprehensive and objective detection of esophageal dynamic function. However, after HRM results are obtained, doctors still need to evaluate a variety of parameters; this work is burdensome and the process is complex. We apply image processing to HRM to predict esophageal contraction vigor and assist the evaluation of esophageal dynamic function. First, we use Feature-Extraction and Histogram of Gradients (FE-HOG) to analyze the features of each proposal of swallow (PoS) and extract higher-order features. We then classify esophageal contraction vigor as normal, weak, or failed by applying a linear SVM to these features. Our dataset includes 3000 training samples, 500 validation samples, and 411 test samples. After verification, our accuracy reaches 86.83%, which is higher than that of other common machine learning methods. + +
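The HOG-plus-linear-SVM pipeline described here is standard; a minimal sketch with scikit-image and scikit-learn might look like the following. The image sizes, HOG settings, and the randomly generated training arrays are placeholders, not the paper's actual data or configuration.

```python
import numpy as np
from skimage.feature import hog
from sklearn.svm import LinearSVC

def hog_features(images):
    # images: array of shape (n, H, W) with single-channel proposal-of-swallow crops
    return np.stack([hog(img, orientations=9, pixels_per_cell=(8, 8),
                         cells_per_block=(2, 2)) for img in images])

rng = np.random.default_rng(0)
X_train = rng.random((30, 64, 64))   # placeholder crops; real inputs would be HRM PoS images
y_train = rng.integers(0, 3, 30)     # 0 = failed, 1 = weak, 2 = normal
X_test = rng.random((5, 64, 64))

clf = LinearSVC(C=1.0, max_iter=10_000)
clf.fit(hog_features(X_train), y_train)
print(clf.predict(hog_features(X_test)))
```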
+
+
+
+
+ + ☆ QuickDrop: Efficient Federated Unlearning by Integrated Dataset + Distillation + + +
+ Federated Unlearning (FU) aims to delete specific training data from an ML +model trained using Federated Learning (FL). We introduce QuickDrop, an +efficient and original FU method that utilizes dataset distillation (DD) to +accelerate unlearning and drastically reduces computational overhead compared +to existing approaches. In QuickDrop, each client uses DD to generate a compact +dataset representative of the original training dataset, called a distilled +dataset, and uses this compact dataset during unlearning. To unlearn specific +knowledge from the global model, QuickDrop has clients execute Stochastic +Gradient Ascent with samples from the distilled datasets, thus significantly +reducing computational overhead compared to conventional FU methods. We further +increase the efficiency of QuickDrop by ingeniously integrating DD into the FL +training process. By reusing the gradient updates produced during FL training +for DD, the overhead of creating distilled datasets becomes close to +negligible. Evaluations on three standard datasets show that, with comparable +accuracy guarantees, QuickDrop reduces the duration of unlearning by 463.8x +compared to model retraining from scratch and 65.1x compared to existing FU +approaches. We also demonstrate the scalability of QuickDrop with 100 clients +and show its effectiveness while handling multiple unlearning operations. + +
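A minimal PyTorch sketch of the core unlearning step — stochastic gradient ascent on a client's distilled samples — is given below, assuming each client already holds a distilled `(x, y)` tensor pair; QuickDrop's dataset-distillation integration and FL orchestration are omitted, and the model, learning rate, and step count are illustrative.

```python
import torch
import torch.nn.functional as F

def unlearn_on_distilled(model, distilled_x, distilled_y, lr=0.01, steps=5):
    """Erase the influence of a client's data by ascending the loss on its distilled set."""
    opt = torch.optim.SGD(model.parameters(), lr=lr)
    model.train()
    for _ in range(steps):
        opt.zero_grad()
        loss = F.cross_entropy(model(distilled_x), distilled_y)
        (-loss).backward()   # negating the loss turns SGD into stochastic gradient ascent
        opt.step()
    return model

# toy usage with a hypothetical linear classifier and 10 distilled samples
model = torch.nn.Linear(32, 4)
x, y = torch.randn(10, 32), torch.randint(0, 4, (10,))
unlearn_on_distilled(model, x, y)
```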
+
+
+
+
+ + ☆ Optimal Clustering of Discrete Mixtures: Binomial, Poisson, Block + Models, and Multi-layer Networks + + +
+ In this paper, we first study the fundamental limit of clustering networks when a multi-layer network is present. Under the mixture multi-layer stochastic block model (MMSBM), we show that the minimax optimal network clustering error rate takes an exponential form and is characterized by the Renyi divergence between the edge probability distributions of the component networks. We propose a novel two-stage network clustering method, including a tensor-based initialization algorithm involving both node and sample splitting and a refinement procedure using a likelihood-based Lloyd algorithm. Network clustering must be accompanied by node community detection. Our proposed algorithm achieves the minimax optimal network clustering error rate and allows extreme network sparsity under the MMSBM. Numerical simulations and real data experiments both validate that our method outperforms existing methods. Oftentimes, the edges of networks carry count-type weights. We therefore extend our methodology and analysis framework to study the minimax optimal clustering error rate for mixtures of discrete distributions, including Binomial, Poisson, and multi-layer Poisson networks. The minimax optimal clustering error rates in these discrete mixtures all take the same exponential form characterized by Renyi divergences, and they can also be achieved by our proposed two-stage clustering algorithm. + +
+
+
+
+
+ + ☆ UniRepLKNet: A Universal Perception Large-Kernel ConvNet for Audio, + Video, Point Cloud, Time-Series and Image Recognition + + +
+ Large-kernel convolutional neural networks (ConvNets) have recently received +extensive research attention, but there are two unresolved and critical issues +that demand further investigation. 1) The architectures of existing +large-kernel ConvNets largely follow the design principles of conventional +ConvNets or transformers, while the architectural design for large-kernel +ConvNets remains under-addressed. 2) As transformers have dominated multiple +modalities, it remains to be investigated whether ConvNets also have a strong +universal perception ability in domains beyond vision. In this paper, we +contribute from two aspects. 1) We propose four architectural guidelines for +designing large-kernel ConvNets, the core of which is to exploit the essential +characteristics of large kernels that distinguish them from small kernels - +they can see wide without going deep. Following such guidelines, our proposed +large-kernel ConvNet shows leading performance in image recognition. For +example, our models achieve an ImageNet accuracy of 88.0%, ADE20K mIoU of +55.6%, and COCO box AP of 56.4%, demonstrating better performance and higher +speed than a number of recently proposed powerful competitors. 2) We discover +that large kernels are the key to unlocking the exceptional performance of +ConvNets in domains where they were originally not proficient. With certain +modality-related preprocessing approaches, the proposed model achieves +state-of-the-art performance on time-series forecasting and audio recognition +tasks even without modality-specific customization to the architecture. Code +and all the models at https://github.com/AILab-CVC/UniRepLKNet. + +
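UniRepLKNet's actual block follows the four guidelines laid out in the paper; the sketch below only illustrates the generic ingredient the abstract highlights — a very large depthwise convolution wrapped in a residual block so the network "sees wide without going deep" — and is not the authors' architecture.

```python
import torch
import torch.nn as nn

class LargeKernelBlock(nn.Module):
    """Residual block with a large depthwise kernel: wide receptive field, no extra depth."""
    def __init__(self, dim: int, kernel_size: int = 31):
        super().__init__()
        self.dw = nn.Conv2d(dim, dim, kernel_size, padding=kernel_size // 2, groups=dim)
        self.norm = nn.BatchNorm2d(dim)
        self.pw = nn.Conv2d(dim, dim, kernel_size=1)   # pointwise channel mixing

    def forward(self, x):
        return x + self.pw(self.norm(self.dw(x)))

x = torch.randn(2, 64, 56, 56)
print(LargeKernelBlock(64)(x).shape)   # torch.Size([2, 64, 56, 56])
```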
+
+ comment: Code, all the models and reproducible training scripts at + https://github.com/AILab-CVC/UniRepLKNet +
+
+
+
+
+ + ☆ Quantum Langevin Dynamics for Optimization + + +
+ We initiate the study of utilizing Quantum Langevin Dynamics (QLD) to solve optimization problems, particularly those non-convex objective functions that present substantial obstacles for traditional gradient descent algorithms. Specifically, we examine the dynamics of a system coupled with an infinite heat bath. This interaction induces both random quantum noise and a deterministic damping effect in the system, which nudge the system towards a steady state that hovers near the global minimum of the objective function. We theoretically prove the convergence of QLD in convex landscapes, demonstrating that the average energy of the system can approach zero in the low-temperature limit with an exponential decay rate correlated with the evolution time. Numerically, we first show the energy dissipation capability of QLD by retracing its origins to spontaneous emission. Furthermore, we conduct a detailed discussion of the impact of each parameter. Finally, based on observations from comparing QLD with the classical Fokker-Planck-Smoluchowski equation, we propose a time-dependent QLD that makes temperature and $\hbar$ time-dependent parameters, which can be theoretically proven to converge better than the time-independent case and also outperforms a series of state-of-the-art quantum and classical optimization algorithms in many non-convex landscapes. + +
+
+ comment: 33 pages, 1 table, 26 figures +
+
+
+
+
+ + ☆ A deep learning approach for marine snow synthesis and removal + + +
+ Marine snow, the floating particles in underwater images, severely degrades +the visibility and performance of human and machine vision systems. This paper +proposes a novel method to reduce the marine snow interference using deep +learning techniques. We first synthesize realistic marine snow samples by +training a Generative Adversarial Network (GAN) model and combine them with +natural underwater images to create a paired dataset. We then train a U-Net +model to perform marine snow removal as an image to image translation task. Our +experiments show that the U-Net model can effectively remove both synthetic and +natural marine snow with high accuracy, outperforming state-of-the-art methods +such as the Median filter and its adaptive variant. We also demonstrate the +robustness of our method by testing it on the MSRB dataset, which contains +synthetic artifacts that our model has not seen during training. Our method is +a practical and efficient solution for enhancing underwater images affected by +marine snow. + +
+
+
+
+
+ + ☆ A Simple Geometric-Aware Indoor Positioning Interpolation Algorithm + Based on Manifold Learning + + +
+ Interpolation methodologies have been widely used within the domain of indoor +positioning systems. However, existing indoor positioning interpolation +algorithms exhibit several inherent limitations, including reliance on complex +mathematical models, limited flexibility, and relatively low precision. To +enhance the accuracy and efficiency of indoor positioning interpolation +techniques, this paper proposes a simple yet powerful geometric-aware +interpolation algorithm for indoor positioning tasks. The key to our algorithm +is to exploit the geometric attributes of the local topological manifold using +manifold learning principles. Therefore, instead of constructing complicated +mathematical models, the proposed algorithm facilitates the more precise and +efficient estimation of points grounded in the local topological manifold. +Moreover, our proposed method can be effortlessly integrated into any indoor +positioning system, thereby bolstering its adaptability. Through a systematic +array of experiments and comprehensive performance analyses conducted on both +simulated and real-world datasets, we demonstrate that the proposed algorithm +consistently outperforms the most commonly used and representative +interpolation approaches regarding interpolation accuracy and efficiency. +Furthermore, the experimental results also underscore the substantial practical +utility of our method and its potential applicability in real-time indoor +positioning scenarios. + +
+
+
+
+
+ + ☆ Lightly Weighted Automatic Audio Parameter Extraction for the Quality + Assessment of Consensus Auditory-Perceptual Evaluation of Voice + + +
+ The Consensus Auditory-Perceptual Evaluation of Voice is a widely employed tool in clinical voice quality assessment that is significant for streamlining communication among clinical professionals and for benchmarking the determination of further treatment. Currently, because the assessment relies on experienced clinicians, it tends to be inconsistent and thus difficult to standardize. To address this problem, we propose to leverage lightly weighted automatic audio parameter extraction to increase the clinical relevance, reduce the complexity, and enhance the interpretability of voice quality assessment. The proposed method utilizes age, sex, and five audio parameters: jitter, absolute jitter, shimmer, harmonic-to-noise ratio (HNR), and zero crossing. A classical machine learning approach is employed. The results reveal that our approach performs similarly to state-of-the-art (SOTA) methods and outperforms latent representations obtained from popular audio pre-trained models. This approach provides insights into the feasibility of different feature extraction approaches for voice evaluation. Audio parameters such as jitter and the HNR are shown to be suitable for characterizing voice quality attributes such as roughness and strain. Conversely, pre-trained models exhibit limitations in effectively addressing noise-related scorings. This study contributes toward more comprehensive and precise voice quality evaluations, achieved by comprehensively exploring diverse assessment methodologies. + +
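A minimal scikit-learn sketch of the kind of lightweight tabular pipeline described above — a handful of audio parameters plus demographics fed to a classical classifier — is given below. The feature values, label encoding, and choice of logistic regression are placeholders, not the paper's exact setup.

```python
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# feature columns: age, sex, jitter, absolute jitter, shimmer, HNR, zero-crossing rate
rng = np.random.default_rng(0)
X = rng.random((100, 7))      # placeholder parameter values extracted from voice recordings
y = rng.integers(0, 4, 100)   # placeholder severity grades

clf = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
clf.fit(X, y)
print(clf.predict(X[:5]))
```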
+
+ comment: Published in IEEE 42th International Conference on Consumer + Electronics (ICCE 2024) +
+
+
+
+
+ + ☆ Experimental Analysis of Large-scale Learnable Vector Storage + Compression + + +
+ Learnable embedding vector is one of the most important applications in +machine learning, and is widely used in various database-related domains. +However, the high dimensionality of sparse data in recommendation tasks and the +huge volume of corpus in retrieval-related tasks lead to a large memory +consumption of the embedding table, which poses a great challenge to the +training and deployment of models. Recent research has proposed various methods +to compress the embeddings at the cost of a slight decrease in model quality or +the introduction of other overheads. Nevertheless, the relative performance of +these methods remains unclear. Existing experimental comparisons only cover a +subset of these methods and focus on limited metrics. In this paper, we perform +a comprehensive comparative analysis and experimental evaluation of embedding +compression. We introduce a new taxonomy that categorizes these techniques +based on their characteristics and methodologies, and further develop a modular +benchmarking framework that integrates 14 representative methods. Under a +uniform test environment, our benchmark fairly evaluates each approach, +presents their strengths and weaknesses under different memory budgets, and +recommends the best method based on the use case. In addition to providing +useful guidelines, our study also uncovers the limitations of current methods +and suggests potential directions for future research. + +
+
+
+
+
+ + ☆ UFDA: Universal Federated Domain Adaptation with Practical Assumptions AAAI2024 + + +
+ Conventional Federated Domain Adaptation (FDA) approaches usually demand an +abundance of assumptions, such as label set consistency, which makes them +significantly less feasible for real-world situations and introduces security +hazards. In this work, we propose a more practical scenario named Universal +Federated Domain Adaptation (UFDA). It only requires the black-box model and +the label set information of each source domain, while the label sets of +different source domains could be inconsistent and the target-domain label set +is totally blind. This relaxes the assumptions made by FDA, which are often +challenging to meet in real-world cases and diminish model security. To address +the UFDA scenario, we propose a corresponding framework called Hot-Learning +with Contrastive Label Disambiguation (HCLD), which tackles UFDA's domain +shifts and category gaps problem by using one-hot outputs from the black-box +models of various source domains. Moreover, to better distinguish the shared +and unknown classes, we further present a cluster-level strategy named +Mutual-Voting Decision (MVD) to extract robust consensus knowledge across peer +classes from both source and target domains. The extensive experiments on three +benchmarks demonstrate that our HCLD achieves comparable performance for our +UFDA scenario with much fewer assumptions, compared to the previous +methodologies with many additional assumptions. + +
+
+ comment: Submitted to AAAI2024 +
+
+
+
+
+ + ☆ SpotServe: Serving Generative Large Language Models on Preemptible + Instances ASPLOS 2024 + + +
+ The high computational and memory requirements of generative large language models (LLMs) make it challenging to serve them cheaply. This paper aims to reduce the monetary cost of serving LLMs by leveraging preemptible GPU instances on modern clouds, which offer access to spare GPUs at a much cheaper price than regular instances but may be preempted by the cloud at any time. Serving LLMs on preemptible instances requires addressing challenges induced by frequent instance preemptions and the necessity of migrating instances to handle these preemptions. This paper presents SpotServe, the first distributed LLM serving system on preemptible instances. Several key techniques in SpotServe realize fast and reliable serving of generative LLMs on cheap preemptible instances. First, SpotServe dynamically adapts the LLM parallelization configuration to dynamic instance availability and fluctuating workload, while balancing the trade-off among overall throughput, inference latency, and monetary cost. Second, to minimize the cost of migrating instances for dynamic reparallelization, the task of migrating instances is formulated as a bipartite graph matching problem, which is solved with the Kuhn-Munkres algorithm to identify an optimal migration plan that minimizes communication. Finally, to take advantage of the grace period offered by modern clouds, we introduce stateful inference recovery, a new inference mechanism that commits inference progress at a much finer granularity and allows SpotServe to cheaply resume inference upon preemption. We evaluate SpotServe on real spot instance preemption traces and various popular LLMs and show that it can reduce the P99 tail latency by 2.4 - 9.1x compared with the best existing LLM serving systems. We also show that SpotServe can leverage the price advantage of preemptible instances, saving 54% of the monetary cost compared with only using on-demand instances. + +
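The bipartite matching step can be illustrated with SciPy's Hungarian-algorithm (Kuhn-Munkres) solver. The cost matrix below — communication cost of placing each old model partition on each surviving GPU instance — is a made-up example, not SpotServe's actual cost model.

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

# comm_cost[i, j]: bytes that must move if partition i is placed on surviving instance j
rng = np.random.default_rng(0)
comm_cost = rng.integers(0, 100, size=(4, 4))

rows, cols = linear_sum_assignment(comm_cost)   # Kuhn-Munkres / Hungarian algorithm
plan = dict(zip(rows.tolist(), cols.tolist()))
print("migration plan:", plan, "total cost:", int(comm_cost[rows, cols].sum()))
```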
+
+ comment: ASPLOS 2024 +
+
+
+
+
+ + ☆ Evaluating the Efficacy of Hybrid Deep Learning Models in Distinguishing + AI-Generated Text + + +
+ My research investigates the use of cutting-edge hybrid deep learning models to accurately differentiate between AI-generated text and human writing. I applied a robust methodology, utilising a carefully selected dataset comprising AI and human texts from various sources, each tagged with instructions. Advanced natural language processing techniques facilitated the analysis of textual features. By combining sophisticated neural networks, the custom model was able to detect nuanced differences between AI-generated and human content. + +
+
+
+
+
+ + ☆ Instruct2Attack: Language-Guided Semantic Adversarial Attacks + + +
+ We propose Instruct2Attack (I2A), a language-guided semantic attack that +generates semantically meaningful perturbations according to free-form language +instructions. We make use of state-of-the-art latent diffusion models, where we +adversarially guide the reverse diffusion process to search for an adversarial +latent code conditioned on the input image and text instruction. Compared to +existing noise-based and semantic attacks, I2A generates more natural and +diverse adversarial examples while providing better controllability and +interpretability. We further automate the attack process with GPT-4 to generate +diverse image-specific text instructions. We show that I2A can successfully +break state-of-the-art deep neural networks even under strong adversarial +defenses, and demonstrate great transferability among a variety of network +architectures. + +
+
+ comment: under submission, code coming soon +
+
+
+
+
+ + ☆ From Prediction to Action: The Critical Role of Proper Performance + Estimation for Machine-Learning-Driven Materials Discovery + + +
+ Materials discovery driven by statistical property models is an iterative +decision process, during which an initial data collection is extended with new +data proposed by a model-informed acquisition function--with the goal to +maximize a certain "reward" over time, such as the maximum property value +discovered so far. While the materials science community achieved much progress +in developing property models that predict well on average with respect to the +training distribution, this form of in-distribution performance measurement is +not directly coupled with the discovery reward. This is because an iterative +discovery process has a shifting reward distribution that is +over-proportionally determined by the model performance for exceptional +materials. We demonstrate this problem using the example of bulk modulus +maximization among double perovskite oxides. We find that the in-distribution +predictive performance suggests random forests as superior to Gaussian process +regression, while the results are inverse in terms of the discovery rewards. We +argue that the lack of proper performance estimation methods from pre-computed +data collections is a fundamental problem for improving data-driven materials +discovery, and we propose a novel such estimator that, in contrast to na\"ive +reward estimation, successfully predicts Gaussian processes with the "expected +improvement" acquisition function as the best out of four options in our +demonstrational study for double perovskites. Importantly, it does so without +requiring the over thousand ab initio computations that were needed to confirm +this prediction. + +
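For readers unfamiliar with the "expected improvement" acquisition function mentioned above, here is its standard closed form for a Gaussian posterior (maximization convention). This is the textbook formula, not anything specific to the double-perovskite study.

```python
import numpy as np
from scipy.stats import norm

def expected_improvement(mu, sigma, best_so_far, xi=0.0):
    """EI for maximization: E[max(f - best_so_far - xi, 0)] when f ~ N(mu, sigma^2)."""
    sigma = np.maximum(sigma, 1e-12)
    z = (mu - best_so_far - xi) / sigma
    return (mu - best_so_far - xi) * norm.cdf(z) + sigma * norm.pdf(z)

print(expected_improvement(mu=np.array([1.0, 1.5]),
                           sigma=np.array([0.2, 0.5]),
                           best_so_far=1.2))
```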
+
+
+
+
+ + ☆ Deficiency of Large Language Models in Finance: An Empirical Examination + of Hallucination + + +
+ The hallucination issue is recognized as a fundamental deficiency of large language models (LLMs), especially when applied to fields such as finance, education, and law. Despite the growing concerns, there has been a lack of empirical investigation. In this paper, we provide an empirical examination of LLMs' hallucination behaviors in financial tasks. First, we empirically investigate LLMs' ability to explain financial concepts and terminology. Second, we assess LLMs' capacity to query historical stock prices. Third, to alleviate the hallucination issue, we evaluate the efficacy of four practical methods: few-shot learning, Decoding by Contrasting Layers (DoLa), Retrieval-Augmented Generation (RAG), and prompt-based tool learning for generating a query command. Finally, our major finding is that off-the-shelf LLMs exhibit serious hallucination behaviors in financial tasks. Therefore, there is an urgent need for research efforts to mitigate LLMs' hallucination. + +
+
+
+
+
+ + ☆ Dataset Distillation in Latent Space + + +
+ Dataset distillation (DD) is a newly emerging research area aiming at alleviating the heavy computational load of training models on large datasets. It tries to distill a large dataset into a small and condensed one so that models trained on the distilled dataset can perform comparably with those trained on the full dataset on downstream tasks. Among the previous works in this area, three key problems hinder the performance and availability of existing DD methods: high time complexity, high space complexity, and low info-compactness. In this work, we simultaneously attempt to settle these three problems by moving the DD process from the conventionally used pixel space to latent space. Encoded by a pretrained generic autoencoder, latent codes are naturally info-compact representations of the original images at much smaller sizes. After transferring three mainstream DD algorithms to latent space, we significantly reduce time and space consumption while achieving similar performance, allowing us to distill high-resolution datasets or target greater data ratios at which previous methods have failed. Besides, within the same storage budget, we can also quantitatively deliver more latent codes than pixel-level images, which further boosts the performance of our methods. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Out-of-Distribution Generalized Dynamic Graph Neural Network for Human + Albumin Prediction + + +
+ Human albumin is essential for indicating the body's overall health. Accurately predicting plasma albumin levels and determining appropriate doses are urgent clinical challenges, particularly in critically ill patients, to maintain optimal blood levels. However, human albumin prediction is non-trivial, as it has to leverage the dynamics of biochemical markers as well as the experience of treating patients. Moreover, the problem of distribution shift is often encountered in real clinical data, which may lead to a decline in the model's prediction performance and reduce the reliability of the model's application. In this paper, we propose a framework named Out-of-Distribution Generalized Dynamic Graph Neural Network for Human Albumin Prediction (DyG-HAP), which is able to provide accurate albumin predictions for Intensive Care Unit (ICU) patients during hospitalization. We first model human albumin prediction as a dynamic graph regression problem to capture the dynamics and patient relationships. Then, we propose a disentangled dynamic graph attention mechanism to capture and disentangle the patterns whose relationships to labels under distribution shifts are invariant and variant, respectively. Last, we propose an invariant dynamic graph regression method to encourage the model to rely on invariant patterns to make predictions. Moreover, we propose a dataset named Albumin level testing and nutritional dosing data for Intensive Care (ANIC) for evaluation. Extensive experiments demonstrate the superiority of our method compared to several baseline methods in human albumin prediction. + +
+
+ comment: MedAI'23 +
+
+
+
+
+ + ☆ SVRDA: A Web-based Dataset Annotation Tool for Slice-to-Volume + Registration + + +
+ Background and Objective: The lack of benchmark datasets has impeded the +development of slice-to-volume registration algorithms. Such datasets are +difficult to annotate, primarily due to the dimensional difference within data +and the dearth of task-specific software. We aim to develop a user-friendly +tool to streamline dataset annotation for slice-to-volume registration. + Methods: The proposed tool, named SVRDA, is an installation-free web +application for platform-agnostic collaborative dataset annotation. It enables +efficient transformation manipulation via keyboard shortcuts and smooth case +transitions with auto-saving. SVRDA supports configuration-based data loading +and adheres to the separation of concerns, offering great flexibility and +extensibility for future research. Various supplementary features have been +implemented to facilitate slice-to-volume registration. + Results: We validated the effectiveness of SVRDA by indirectly evaluating the +post-registration segmentation quality on UK Biobank data, observing a dramatic +overall improvement (24.02% in the Dice Similarity Coefficient and 48.93% in +the 95th percentile Hausdorff distance, respectively) supported by highly +statistically significant evidence ($p<0.001$).We further showcased the +clinical usage of SVRDA by integrating it into test-retest T1 quantification on +in-house magnetic resonance images, leading to more consistent results after +registration. + Conclusions: SVRDA can facilitate collaborative annotation of benchmark +datasets while being potentially applicable to other pipelines incorporating +slice-to-volume registration. Full source code and documentation are available +at https://github.com/Roldbach/SVRDA + +
+
+ comment: 18 pages, 11 figures, In submission to Computer Methods and Programs + in Biomedicine +
+
+
+
+
+ + ☆ SSIN: Self-Supervised Learning for Rainfall Spatial Interpolation SIGMOD 2023 + + +
+ The acquisition of accurate rainfall distribution in space is an important +task in hydrological analysis and natural disaster pre-warning. However, it is +impossible to install rain gauges on every corner. Spatial interpolation is a +common way to infer rainfall distribution based on available raingauge data. +However, the existing works rely on some unrealistic pre-settings to capture +spatial correlations, which limits their performance in real scenarios. To +tackle this issue, we propose the SSIN, which is a novel data-driven +self-supervised learning framework for rainfall spatial interpolation by mining +latent spatial patterns from historical observation data. Inspired by the Cloze +task and BERT, we fully consider the characteristics of spatial interpolation +and design the SpaFormer model based on the Transformer architecture as the +core of SSIN. Our main idea is: by constructing rich self-supervision signals +via random masking, SpaFormer can learn informative embeddings for raw data and +then adaptively model spatial correlations based on rainfall spatial context. +Extensive experiments on two real-world raingauge datasets show that our method +outperforms the state-of-the-art solutions. In addition, we take traffic +spatial interpolation as another use case to further explore the performance of +our method, and SpaFormer achieves the best performance on one large real-world +traffic dataset, which further confirms the effectiveness and generality of our +method. + +
+
+ comment: SIGMOD 2023 Data-intensive Applications (DIA) Track; Code is + available at https://github.com/jlidw/SSIN +
+
+
+
+
+ + ☆ Active Foundational Models for Fault Diagnosis of Electrical Motors + + +
+ Fault detection and diagnosis of electrical motors are of utmost importance in ensuring the safe and reliable operation of several industrial systems. Detection and diagnosis of faults at the incipient stage allows corrective actions to be taken to reduce the severity of faults. Existing data-driven deep learning approaches for machine fault diagnosis rely extensively on huge amounts of labeled samples, whose annotation is expensive and time-consuming, while a major portion of unlabeled condition monitoring data is not exploited in the training process. To overcome this limitation, we propose a foundational model-based Active Learning framework that utilizes a smaller number of labeled samples, chosen to be the most informative, and harnesses the large amount of available unlabeled data by effectively combining Active Learning and Contrastive Self-Supervised Learning techniques. It consists of a transformer network-based backbone model trained using an advanced nearest-neighbor contrastive self-supervised learning method. This approach empowers the backbone to learn improved representations of samples derived from raw, unlabeled vibration data. Subsequently, the backbone can undergo fine-tuning to address a range of downstream tasks, both within the same machines and across different machines. The effectiveness of the proposed methodology has been assessed through the fine-tuning of the backbone for multiple target tasks using three distinct machine-bearing fault datasets. The experimental evaluation demonstrates superior performance compared to existing state-of-the-art fault diagnosis methods while using less labeled data. + +
+
+ comment: 30 pages, 2 figures, 7 tables +
+
+
+
+
+ + ☆ A Comparative and Experimental Study on Automatic Question Answering + Systems and its Robustness against Word Jumbling + + +
+ Question answer generation using Natural Language Processing models is ubiquitous in the world around us. It is used in many applications, such as building chatbots, generating suggestive prompts in Google search, and navigating information in mobile banking applications. It is highly relevant because a frequently asked questions (FAQ) list can only contain a finite number of questions, whereas a model that performs question answer generation may be able to answer completely new questions within the scope of the data, as long as the question is relevant. In commercial applications, it can be used to increase customer satisfaction and ease of use. However, a lot of data is generated by humans, so it is susceptible to human error, which can adversely affect a model's performance; we investigate this effect in our work. + +
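As an illustration of the "word jumbling" perturbation named in the title, a simple corruption that shuffles the interior letters of each word (one common way to simulate human typing noise) can be written as follows; the exact perturbation used in the paper may differ.

```python
import random

def jumble(text: str, seed: int = 0) -> str:
    """Shuffle the interior letters of each word, keeping the first and last characters fixed."""
    rng = random.Random(seed)
    out = []
    for word in text.split():
        if len(word) > 3:
            middle = list(word[1:-1])
            rng.shuffle(middle)
            word = word[0] + "".join(middle) + word[-1]
        out.append(word)
    return " ".join(out)

print(jumble("What is the interest rate on a fixed deposit account?"))
```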
+
+
+
+
+ + ☆ Learning with Complementary Labels Revisited: A Consistent Approach via + Negative-Unlabeled Learning + + +
+ Complementary-label learning is a weakly supervised learning problem in which +each training example is associated with one or multiple complementary labels +indicating the classes to which it does not belong. Existing consistent +approaches have relied on the uniform distribution assumption to model the +generation of complementary labels, or on an ordinary-label training set to +estimate the transition matrix. However, both conditions may not be satisfied +in real-world scenarios. In this paper, we propose a novel complementary-label +learning approach that does not rely on these conditions. We find that +complementary-label learning can be expressed as a set of negative-unlabeled +binary classification problems when using the one-versus-rest strategy. This +observation allows us to propose a risk-consistent approach with theoretical +guarantees. Furthermore, we introduce a risk correction approach to address +overfitting problems when using complex models. We also prove the statistical +consistency and convergence rate of the corrected risk estimator. Extensive +experimental results on both synthetic and real-world benchmark datasets +validate the superiority of our proposed approach over state-of-the-art +methods. + +
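The one-versus-rest reduction described above can be made concrete with a small sketch: for class k, an example whose complementary label is k is a known negative, while every other example is unlabeled with respect to k. The helper below only shows that decomposition; it deliberately omits the paper's risk-consistent estimator and risk correction, and all names and data are illustrative.

```python
import numpy as np

def negative_unlabeled_split(X, complementary_labels, num_classes):
    """For each class k, split the data into known negatives (bar-y == k) and unlabeled examples."""
    problems = {}
    for k in range(num_classes):
        neg_mask = complementary_labels == k
        problems[k] = {"negatives": X[neg_mask], "unlabeled": X[~neg_mask]}
    return problems

rng = np.random.default_rng(0)
X = rng.random((12, 3))
cbar = rng.integers(0, 4, 12)   # complementary labels: classes the examples do NOT belong to
for k, split in negative_unlabeled_split(X, cbar, 4).items():
    print(k, len(split["negatives"]), len(split["unlabeled"]))
```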
+
+
+
+
+ + ☆ Function-constrained Program Synthesis NeurIPS + + +
+ This work introduces (1) a technique that allows large language models (LLMs) +to leverage user-provided code when solving programming tasks and (2) a method +to iteratively generate modular sub-functions that can aid future code +generation attempts when the initial code generated by the LLM is inadequate. +Generating computer programs in general-purpose programming languages like +Python poses a challenge for LLMs when instructed to use code provided in the +prompt. Code-specific LLMs (e.g., GitHub Copilot, CodeLlama2) can generate code +completions in real-time by drawing on all code available in a development +environment. However, restricting code-specific LLMs to use only in-context +code is not straightforward, as the model is not explicitly instructed to use +the user-provided code and users cannot highlight precisely which snippets of +code the model should incorporate into its context. Moreover, current systems +lack effective recovery methods, forcing users to iteratively re-prompt the +model with modified prompts until a sufficient solution is reached. Our method +differs from traditional LLM-powered code-generation by constraining +code-generation to an explicit function set and enabling recovery from failed +attempts through automatically generated sub-functions. When the LLM cannot +produce working code, we generate modular sub-functions to aid subsequent +attempts at generating functional code. A by-product of our method is a library +of reusable sub-functions that can solve related tasks, imitating a software +team where efficiency scales with experience. We also introduce a new +"half-shot" evaluation paradigm that provides tighter estimates of LLMs' coding +abilities compared to traditional zero-shot evaluation. Our proposed evaluation +method encourages models to output solutions in a structured format, decreasing +syntax errors that can be mistaken for poor coding ability. + +
+
+ comment: 17 pages, 6 figures, 2023 NeurIPS R0-Fomo Workshop +
+
+
+
+
+ + ☆ Adaptive Image Registration: A Hybrid Approach Integrating Deep Learning + and Optimization Functions for Enhanced Precision + + +
+ Image registration has traditionally been done using two distinct approaches: learning-based methods, relying on robust deep neural networks, and optimization-based methods, applying complex mathematical transformations to warp images accordingly. Both paradigms offer advantages and disadvantages, and in this work we seek to combine their respective strengths into a single streamlined framework, using the outputs of the learning-based method as initial parameters for optimization while prioritizing computational power for the image pairs that incur the greatest loss. Our investigations showed an improvement of 0.3\% in testing when utilizing the best-performing state-of-the-art model as the backbone of the framework, while maintaining the same inference time and incurring only a 0.8\% loss in deformation field smoothness. + +
+
+
+
+
+ + ☆ Global $\mathcal{L}^2$ minimization with certainty via geometrically + adapted gradient descent in Deep Learning + + +
+ We consider the gradient descent flow widely used for the minimization of the +$\mathcal{L}^2$ cost function in Deep Learning networks, and introduce two +modified versions; one adapted for the overparametrized setting, and the other +for the underparametrized setting. Both have a clear and natural invariant +geometric meaning, taking into account the pullback vector bundle structure in +the overparametrized, and the pushforward vector bundle structure in the +underparametrized setting. In the overparametrized case, we prove that, +provided that a rank condition holds, all orbits of the modified gradient +descent drive the $\mathcal{L}^2$ cost to its global minimum at a uniform +exponential convergence rate. We point out relations of the latter to +sub-Riemannian geometry. + +
+
+ comment: AMS Latex, 12 pages +
+
+
+
+
+ + ☆ Learning Multimodal Latent Dynamics for Human-Robot Interaction + + +
+ This article presents a method for learning well-coordinated Human-Robot Interaction (HRI) from Human-Human Interactions (HHI). We devise a hybrid approach using Hidden Markov Models (HMMs) as the latent space priors for a Variational Autoencoder to model a joint distribution over the interacting agents. We leverage the interaction dynamics learned from HHI to learn HRI and incorporate the conditional generation of robot motions from human observations into the training, thereby predicting more accurate robot trajectories. The generated robot motions are further adapted with Inverse Kinematics to ensure the desired physical proximity with a human, combining the ease of joint space learning with accurate task space reachability. For contact-rich interactions, we modulate the robot's stiffness using HMM segmentation for a compliant interaction. We verify the effectiveness of our approach, deployed on a humanoid robot, via a user study. Our method generalizes well to various humans despite being trained on data from just two humans. We find that users perceive our method as more human-like, timely, and accurate, and rank it with a higher degree of preference over other baselines. + +
+
+ comment: 20 Pages, 10 Figures +
+
+
+
+
+ + ☆ Bayesian Formulations for Graph Spectral Denoising + + +
+ We consider noisy signals which are defined on the vertices of a graph and +present smoothing algorithms for the cases of Gaussian, dropout, and uniformly +distributed noise. The signals are assumed to follow a prior distribution +defined in the frequency domain which favors signals which are smooth across +the edges of the graph. By pairing this prior distribution with our three +models of noise generation, we propose \textit{Maximum A Posteriori} (M.A.P.) +estimates of the true signal in the presence of noisy data and provide +algorithms for computing the M.A.P. Finally, we demonstrate the algorithms' +ability to effectively restore white noise on image data, and from severe +dropout in toy \& EHR data. + +
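For the Gaussian-noise case, combining a Gaussian likelihood with a smoothness prior of the form $p(x)\propto\exp(-\lambda\, x^{\top} L x)$ over the graph Laplacian $L$ gives the familiar closed-form MAP estimate (graph Tikhonov filtering). The sketch below assumes that particular prior; the paper's exact prior and noise models may differ.

```python
import numpy as np

def map_denoise_gaussian(y, L, noise_var, smoothness):
    """MAP estimate under y = x + N(0, noise_var * I) and prior p(x) ∝ exp(-smoothness * x^T L x)."""
    n = L.shape[0]
    return np.linalg.solve(np.eye(n) + 2.0 * noise_var * smoothness * L, y)

# path graph on 5 vertices: Laplacian L = D - A
A = np.diag(np.ones(4), 1) + np.diag(np.ones(4), -1)
L = np.diag(A.sum(axis=1)) - A
y = np.array([1.0, 1.2, 5.0, 1.1, 0.9])   # noisy signal with a spike on the middle vertex
print(map_denoise_gaussian(y, L, noise_var=0.5, smoothness=1.0))
```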
+
+
+
+
+ + ☆ Physics-Informed Neural Network for Discovering Systems with + Unmeasurable States with Application to Lithium-Ion Batteries + + +
+ Combining machine learning with physics is a trending approach for discovering unknown dynamics, and one of the most intensively studied frameworks is the physics-informed neural network (PINN). However, PINN often fails to optimize the network due to its difficulty in concurrently minimizing multiple losses originating from the system's governing equations. This problem can be more serious when the system's states are unmeasurable, as is the case for lithium-ion batteries (LiBs). In this work, we introduce a robust method for training PINN that uses fewer loss terms and thus constructs a less complex landscape for optimization. In particular, instead of having loss terms from each differential equation, this method embeds the dynamics into a loss function that quantifies the error between observed and predicted system outputs. This is accomplished by numerically integrating the predicted states from the neural network (NN) using the known dynamics and transforming them to obtain a sequence of predicted outputs. Minimizing such a loss optimizes the NN to predict states consistent with observations given the physics. Further, the system's parameters can be added to the optimization targets. To demonstrate the ability of this method to perform various modeling and control tasks, we apply it to a battery model to concurrently estimate its states and parameters. + +
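A toy PyTorch illustration of one possible reading of this training idea is given below: the network predicts the hidden initial state and an unknown parameter, the known dynamics are rolled forward with Euler integration, and a single loss compares only the resulting outputs with measurements (no per-equation residual terms). The first-order system, measurement map, and network are assumptions for the sketch, not the battery model from the paper.

```python
import torch
import torch.nn as nn

# Toy system with unmeasurable state x: dx/dt = -a*x + u(t), measured output y = c*x.
a_true, c, dt, T = 0.5, 2.0, 0.1, 50
u = torch.ones(T)
with torch.no_grad():                       # generate synthetic measurements
    x = torch.zeros(T + 1)
    for k in range(T):
        x[k + 1] = x[k] + dt * (-a_true * x[k] + u[k])
    y_meas = c * x[1:]

net = nn.Sequential(nn.Linear(T, 16), nn.Tanh(), nn.Linear(16, 2))   # predicts [x0, a]
opt = torch.optim.Adam(net.parameters(), lr=1e-2)

for step in range(500):
    x0, a_hat = net(u.unsqueeze(0)).squeeze(0)
    xs, xk = [], x0
    for k in range(T):                      # Euler integration with the *known* dynamics form
        xk = xk + dt * (-a_hat * xk + u[k])
        xs.append(xk)
    y_pred = c * torch.stack(xs)
    loss = ((y_pred - y_meas) ** 2).mean()  # single output-error loss
    opt.zero_grad(); loss.backward(); opt.step()

print(float(a_hat))   # should move toward a_true = 0.5 on this toy problem
```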
+
+ comment: 7 pages, 4 figure, submitted to American Control Conference 2024 +
+
+
+
+
+ + ♻ ☆ FutureHuman3D: Forecasting Complex Long-Term 3D Human Behavior from + Video Observations + + +
+ We present a generative approach to forecast long-term future human behavior +in 3D, requiring only weak supervision from readily available 2D human action +data. This is a fundamental task enabling many downstream applications. The +required ground-truth data is hard to capture in 3D (mocap suits, expensive +setups) but easy to acquire in 2D (simple RGB cameras). Thus, we design our +method to only require 2D RGB data while being able to generate 3D human motion +sequences. We use a differentiable 2D projection scheme in an autoregressive +manner for weak supervision, and an adversarial loss for 3D regularization. Our +method predicts long and complex behavior sequences (e.g. cooking, assembly) +consisting of multiple sub-actions. We tackle this in a semantically +hierarchical manner, jointly predicting high-level coarse action labels +together with their low-level fine-grained realizations as characteristic 3D +human poses. We observe that these two action representations are coupled in +nature, and joint prediction benefits both action and pose forecasting. Our +experiments demonstrate the complementary nature of joint action and 3D pose +prediction: our joint approach outperforms each task treated individually, +enables robust longer-term sequence prediction, and outperforms alternative +approaches to forecast actions and characteristic 3D poses. + +
+
+ comment: Project Page: https://future-human-3d.christian-diller.de/ Video: + https://www.youtube.com/watch?v=18du85YFXL0 +
+
+
+
+
+ + ♻ ☆ Machine learning-based decentralized TDMA for VLC IoT networks + + +
+ In this paper, a machine learning-based decentralized time division multiple +access (TDMA) algorithm for visible light communication (VLC) Internet of +Things (IoT) networks is proposed. The proposed algorithm is based on +Q-learning, a reinforcement learning algorithm. This paper considers a +decentralized condition in which there is no coordinator node for sending +synchronization frames and assigning transmission time slots to other nodes. +The proposed algorithm uses a decentralized manner for synchronization, and +each node uses the Q-learning algorithm to find the optimal transmission time +slot for sending data without collisions. The proposed algorithm is implemented +on a VLC hardware system, which had been designed and implemented in our +laboratory. Average reward, convergence time, goodput, average delay, and data +packet size are evaluated parameters. The results show that the proposed +algorithm converges quickly and provides collision-free decentralized TDMA for +the network. The proposed algorithm is compared with carrier-sense multiple +access with collision avoidance (CSMA/CA) algorithm as a potential selection +for decentralized VLC IoT networks. The results show that the proposed +algorithm provides up to 61% more goodput and up to 49% less average delay than +CSMA/CA. + +
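A toy, centralized simulation of the idea — each node keeps a Q-value per transmission slot and is rewarded for collision-free transmissions — is sketched below. The frame length, reward values, learning rate, and exploration schedule are arbitrary choices for illustration, not the parameters used on the VLC hardware, and the real system learns in a decentralized fashion.

```python
import numpy as np

rng = np.random.default_rng(1)
n_nodes, n_slots, frames = 4, 8, 3000
Q = np.zeros((n_nodes, n_slots))   # one Q-value per (node, slot); stateless bandit-style update
eps, alpha = 0.1, 0.1

for _ in range(frames):
    choices = [int(np.argmax(Q[n])) if rng.random() > eps else int(rng.integers(n_slots))
               for n in range(n_nodes)]
    for n, slot in enumerate(choices):
        collided = choices.count(slot) > 1
        reward = -1.0 if collided else 1.0
        Q[n, slot] += alpha * (reward - Q[n, slot])

print("final slot assignment:", [int(np.argmax(Q[n])) for n in range(n_nodes)])
```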
+
+ comment: This work has been submitted to a journal for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Self-Guided Diffusion Models CVPR 2023 + + +
+ Diffusion models have demonstrated remarkable progress in image generation +quality, especially when guidance is used to control the generative process. +However, guidance requires a large amount of image-annotation pairs for +training and is thus dependent on their availability, correctness and +unbiasedness. In this paper, we eliminate the need for such annotation by +instead leveraging the flexibility of self-supervision signals to design a +framework for self-guided diffusion models. By leveraging a feature extraction +function and a self-annotation function, our method provides guidance signals +at various image granularities: from the level of holistic images to object +boxes and even segmentation masks. Our experiments on single-label and +multi-label image datasets demonstrate that self-labeled guidance always +outperforms diffusion models without guidance and may even surpass guidance +based on ground-truth labels, especially on unbalanced data. When equipped with +self-supervised box or mask proposals, our method further generates visually +diverse yet semantically consistent images, without the need for any class, +box, or segment label annotation. Self-guided diffusion is simple, flexible and +expected to profit from deployment at scale. Source code will be at: +https://taohu.me/sgdm/ + +
+
+ comment: CVPR 2023 +
+
+
+
+
+ + ♻ ☆ A deep reinforcement learning model for predictive maintenance planning + of road assets: Integrating LCA and LCCA + + +
+ Road maintenance planning is an integral part of road asset management. One of the main challenges in Maintenance and Rehabilitation (M&R) practices is to determine the maintenance type and timing. This research proposes a framework using Reinforcement Learning (RL) based on the Long Term Pavement Performance (LTPP) database to determine the type and timing of M&R practices. A predictive DNN model is first developed, which serves as the Environment for the RL algorithm. For the policy estimation of the RL model, both DQN and PPO models are developed, but PPO was ultimately selected due to its better convergence and higher sample efficiency. The indicators used in this study are the International Roughness Index (IRI) and Rutting Depth (RD). Initially, we considered the Cracking Metric (CM) as a third indicator, but it was excluded because it had much less data than the other indicators, which lowered the accuracy of the results. Furthermore, in the cost-effectiveness calculation (the reward), we considered both the economic and environmental impacts of M&R treatments. Costs and environmental impacts were evaluated with the paLATE 2.0 software. Our method is tested on a hypothetical case study of a 23-kilometer six-lane highway located in Texas, which has a warm and wet climate. The results yield a 20-year M&R plan in which the road condition remains within an excellent range. Because the road starts at a good level of service, no heavy maintenance practices are needed in the first years; later, after heavy M&R actions, there are several one-to-two-year stretches in which no treatment is needed. All of this indicates that the proposed plan is logical. Decision-makers and transportation agencies can use this scheme to conduct better maintenance practices that prevent budget waste and, at the same time, minimize environmental impacts. + +
+
+
+
+
+ + ♻ ☆ Online Estimation and Optimization of Utility-Based Shortfall Risk + + +
+ Utility-Based Shortfall Risk (UBSR) is a risk metric that is increasingly +popular in financial applications, owing to certain desirable properties that +it enjoys. We consider the problem of estimating UBSR in a recursive setting, +where samples from the underlying loss distribution are available +one-at-a-time. We cast the UBSR estimation problem as a root finding problem, +and propose stochastic approximation-based estimations schemes. We derive +non-asymptotic bounds on the estimation error in the number of samples. We also +consider the problem of UBSR optimization within a parameterized class of +random variables. We propose a stochastic gradient descent based algorithm for +UBSR optimization, and derive non-asymptotic bounds on its convergence. + +
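The root-finding view of UBSR estimation can be illustrated with a standard Robbins-Monro recursion. Here UBSR is taken as the smallest $t$ with $\mathbb{E}[\ell(-X - t)] \le \lambda$ for a convex increasing loss $\ell$; the exponential loss, threshold, step sizes, and Gaussian samples below are illustrative choices, not the paper's exact scheme.

```python
import numpy as np

rng = np.random.default_rng(0)
ell = lambda z: np.exp(z)   # illustrative convex increasing loss (exponential utility)
lam = 1.0                   # risk threshold lambda

t = 0.0
for k in range(1, 200_001):
    x = rng.normal(loc=0.5, scale=1.0)   # one-at-a-time sample from the loss distribution
    t += (1.0 / k) * (ell(-x - t) - lam)   # Robbins-Monro step toward E[ell(-X - t)] = lambda

# for X ~ N(mu, s^2): E[exp(-X - t)] = exp(-mu + s^2/2 - t), so the exact root is -mu + s^2/2 = 0
print("estimate:", round(t, 3), "exact:", 0.0)
```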
+
+
+
+
+ + ♻ ☆ DeepTSF: Codeless machine learning operations for time series + forecasting + + +
+ This paper presents DeepTSF, a comprehensive machine learning operations +(MLOps) framework aiming to innovate time series forecasting through workflow +automation and codeless modeling. DeepTSF automates key aspects of the ML +lifecycle, making it an ideal tool for data scientists and MLops engineers +engaged in machine learning (ML) and deep learning (DL)-based forecasting. +DeepTSF empowers users with a robust and user-friendly solution, while it is +designed to seamlessly integrate with existing data analysis workflows, +providing enhanced productivity and compatibility. The framework offers a +front-end user interface (UI) suitable for data scientists, as well as other +higher-level stakeholders, enabling comprehensive understanding through +insightful visualizations and evaluation metrics. DeepTSF also prioritizes +security through identity management and access authorization mechanisms. The +application of DeepTSF in real-life use cases of the I-NERGY project has +already proven DeepTSF's efficacy in DL-based load forecasting, showcasing its +significant added value in the electrical power and energy systems domain. + +
+
+
+
+
+ + ♻ ☆ ManiCast: Collaborative Manipulation with Cost-Aware Human Forecasting + + +
+ Seamless human-robot manipulation in close proximity relies on accurate +forecasts of human motion. While there has been significant progress in +learning forecast models at scale, when applied to manipulation tasks, these +models accrue high errors at critical transition points leading to degradation +in downstream planning performance. Our key insight is that instead of +predicting the most likely human motion, it is sufficient to produce forecasts +that capture how future human motion would affect the cost of a robot's plan. +We present ManiCast, a novel framework that learns cost-aware human forecasts +and feeds them to a model predictive control planner to execute collaborative +manipulation tasks. Our framework enables fluid, real-time interactions between +a human and a 7-DoF robot arm across a number of real-world tasks such as +reactive stirring, object handovers, and collaborative table setting. We +evaluate both the motion forecasts and the end-to-end forecaster-planner system +against a range of learned and heuristic baselines while additionally +contributing new datasets. We release our code and datasets at +https://portal-cornell.github.io/manicast/. + +
+
+ comment: CoRL 2023 +
+
+
+
+
+ + ♻ ☆ Low-degree learning and the metric entropy of polynomials + + +
+ Let $\mathscr{F}_{n,d}$ be the class of all functions $f:\{-1,1\}^n\to[-1,1]$ +on the $n$-dimensional discrete hypercube of degree at most $d$. In the first +part of this paper, we prove that any (deterministic or randomized) algorithm +which learns $\mathscr{F}_{n,d}$ with $L_2$-accuracy $\varepsilon$ requires at +least $\Omega((1-\sqrt{\varepsilon})2^d\log n)$ queries for large enough $n$, +thus establishing the sharpness as $n\to\infty$ of a recent upper bound of +Eskenazis and Ivanisvili (2021). To do this, we show that the $L_2$-packing +numbers $\mathsf{M}(\mathscr{F}_{n,d},\|\cdot\|_{L_2},\varepsilon)$ of the +concept class $\mathscr{F}_{n,d}$ satisfy the two-sided estimate +$$c(1-\varepsilon)2^d\log n \leq \log +\mathsf{M}(\mathscr{F}_{n,d},\|\cdot\|_{L_2},\varepsilon) \leq \frac{2^{Cd}\log +n}{\varepsilon^4}$$ for large enough $n$, where $c, C>0$ are universal +constants. In the second part of the paper, we present a logarithmic upper +bound for the randomized query complexity of classes of bounded approximate +polynomials whose Fourier spectra are concentrated on few subsets. As an +application, we prove new estimates for the number of random queries required +to learn approximate juntas of a given degree, functions with rapidly decaying +Fourier tails and constant depth circuits of given size. Finally, we obtain +bounds for the number of queries required to learn the polynomial class +$\mathscr{F}_{n,d}$ without error in the query and random example models. + +
+
+
+
+
+ + ♻ ☆ Deep Calibration of Market Simulations using Neural Density Estimators + and Embedding Networks + + +
+ The ability to construct a realistic simulator of financial exchanges, +including reproducing the dynamics of the limit order book, can give insight +into many counterfactual scenarios, such as a flash crash, a margin call, or +changes in macroeconomic outlook. In recent years, agent-based models have been +developed that reproduce many features of an exchange, as summarised by a set +of stylised facts and statistics. However, the ability to calibrate simulators +to a specific period of trading remains an open challenge. In this work, we +develop a novel approach to the calibration of market simulators by leveraging +recent advances in deep learning, specifically using neural density estimators +and embedding networks. We demonstrate that our approach is able to correctly +identify high probability parameter sets, both when applied to synthetic and +historical data, and without reliance on manually selected or weighted +ensembles of stylised facts. + +
+
+ comment: 4th ACM International Conference on AI in Finance (ICAIF 2023) +
+
+
+
+
+ + ♻ ☆ Optimal Approximation Rates for Deep ReLU Neural Networks on Sobolev and + Besov Spaces + + +
+ Let $\Omega = [0,1]^d$ be the unit cube in $\mathbb{R}^d$. We study the +problem of how efficiently, in terms of the number of parameters, deep neural +networks with the ReLU activation function can approximate functions in the +Sobolev spaces $W^s(L_q(\Omega))$ and Besov spaces $B^s_r(L_q(\Omega))$, with +error measured in the $L_p(\Omega)$ norm. This problem is important when +studying the application of neural networks in a variety of fields, including +scientific computing and signal processing, and has previously been solved only +when $p=q=\infty$. Our contribution is to provide a complete solution for all +$1\leq p,q\leq \infty$ and $s > 0$ for which the corresponding Sobolev or Besov +space compactly embeds into $L_p$. The key technical tool is a novel +bit-extraction technique which gives an optimal encoding of sparse vectors. +This enables us to obtain sharp upper bounds in the non-linear regime where $p +> q$. We also provide a novel method for deriving $L_p$-approximation lower +bounds based upon VC-dimension when $p < \infty$. Our results show that very +deep ReLU networks significantly outperform classical methods of approximation +in terms of the number of parameters, but that this comes at the cost of +parameters which are not encodable. + +
+
+
+
+
+ + ♻ ☆ What If the TV Was Off? Examining Counterfactual Reasoning Abilities of + Multi-modal Language Models + + +
+ Counterfactual reasoning, a fundamental aspect of human cognition, involves +contemplating alternatives to established facts or past events, significantly +enhancing our abilities in planning and decision-making. In light of the +advancements in current multi-modal large language models, we explore their +effectiveness in counterfactual reasoning. To facilitate this investigation, we +introduce a novel dataset, C-VQA, specifically designed to test the +counterfactual reasoning capabilities of modern multi-modal large language +models. This dataset is constructed by infusing original questions with +counterfactual presuppositions, spanning various types such as numerical and +boolean queries. It encompasses a mix of real and synthetic data, representing +a wide range of difficulty levels. Our thorough evaluations of contemporary +vision-language models using this dataset have revealed substantial performance +drops, with some models showing up to a 40\% decrease, highlighting a +significant gap between current models and human-like vision reasoning +capabilities. We hope our dataset will serve as a vital benchmark for +evaluating the counterfactual reasoning capabilities of models. Code and +dataset are publicly available at https://bzhao.me/C-VQA/. + +
+
+
+
+
+ + ♻ ☆ On the Effectiveness of Log Representation for Log-based Anomaly + Detection + + +
+ Logs are an essential source of information for people to understand the running status of a software system. Due to evolving modern software architectures and maintenance methods, more research efforts have been devoted to automated log analysis. In particular, machine learning (ML) has been widely used in log analysis tasks. In ML-based log analysis tasks, converting textual log data into numerical feature vectors is a critical and indispensable step. However, the impact of using different log representation techniques on the performance of the downstream models is not clear, which limits researchers' and practitioners' ability to choose the optimal log representation techniques for their automated log analysis workflows. Therefore, this work investigates and compares the commonly adopted log representation techniques from previous log analysis research. Particularly, we select six log representation techniques and evaluate them with seven ML models and four public log datasets (i.e., HDFS, BGL, Spirit and Thunderbird) in the context of log-based anomaly detection. We also examine the impacts of the log parsing process and the different feature aggregation approaches when they are employed with log representation techniques. From the experiments, we provide some heuristic guidelines for future researchers and developers to follow when designing an automated log analysis workflow. We believe our comprehensive comparison of log representation techniques can help researchers and practitioners better understand the characteristics of different log representation techniques and provide them with guidance for selecting the most suitable ones for their ML-based log analysis workflow. + +
+
+ comment: Accepted by Journal of Empirical Software Engineering (EMSE) +
+
+
+
+
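For readers who want a concrete picture of the kind of pipeline such studies compare, the sketch below turns already-parsed log event templates into numerical vectors (here TF-IDF, one of several possible representation techniques) and feeds them to a standard classifier for anomaly detection. The session strings, labels, and model choice are invented placeholders, not the paper's actual setup.

```python
# Hypothetical, simplified log-based anomaly detection pipeline:
# log messages -> (already parsed) event templates -> TF-IDF vectors -> classifier.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Toy "sessions": each item is the sequence of event templates observed in one session.
sessions = [
    "block_received block_served block_served",        # normal
    "block_received block_served block_served",        # normal
    "block_received write_exception retry_failed",     # anomalous
    "block_received block_served write_exception",     # anomalous
]
labels = [0, 0, 1, 1]  # 0 = normal, 1 = anomaly

# One representation technique among those typically compared: token-level TF-IDF.
model = make_pipeline(TfidfVectorizer(token_pattern=r"\S+"), LogisticRegression())
model.fit(sessions, labels)

print(model.predict(["block_received retry_failed write_exception"]))
```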
+ + ♻ ☆ Machine learning and Topological data analysis identify unique features + of human papillae in 3D scans + + +
+ The tongue surface houses a range of papillae that are integral to the mechanics and chemistry of taste and textural sensation. Although the gustatory function of papillae is well investigated, the uniqueness of papillae within and across individuals remains elusive. Here, we present the first machine learning framework on 3D microscopic scans of human papillae (n = 2092), uncovering the uniqueness of geometric and topological features of papillae. The finer differences in shapes of papillae are investigated computationally based on a number of features derived from discrete differential geometry and computational topology. Interpretable machine learning techniques show that persistent homology features of the papillae shape are the most effective in predicting the biological variables. Models trained on these features with small volumes of data samples predict the type of papillae with an accuracy of 85%. The papillae type classification models can map the spatial arrangement of filiform and fungiform papillae on a surface. Remarkably, the papillae are found to be distinctive across individuals, and an individual can be identified with an accuracy of 48% among the 15 participants from a single papilla. Collectively, this is the first evidence demonstrating that tongue papillae can serve as a unique identifier, inspiring new research directions for food preferences and oral diagnostics. + +
+
+
+
+
+ + ♻ ☆ AST: Effective Dataset Distillation through Alignment with Smooth and + High-Quality Expert Trajectories + + +
+ Training large AI models typically requires large-scale datasets in the machine learning process, making the training and parameter-tuning processes both time-consuming and costly. Some researchers address this problem by carefully synthesizing a very small number of highly representative and informative samples from real-world datasets. This approach, known as Dataset Distillation (DD), offers a perspective on data-efficient learning. Despite recent progress in this field, the performance of existing methods still cannot meet expectations, and distilled datasets cannot effectively replace original datasets. In this paper, unlike previous methods that focus solely on improving the effectiveness of student distillation, we recognize and leverage the important mutual influence between expert and student models. We observed that the smoothness of expert trajectories has a significant impact on subsequent student parameter alignment. Based on this, we propose an effective DD framework named AST, standing for Alignment with Smooth and high-quality expert Trajectories. We devise the integration of clipping loss and gradient penalty to regulate the rate of parameter changes in expert trajectory generation. To further refine the student parameter alignment with the expert trajectory, we put forward representative initialization for the synthetic dataset and balanced inner-loop loss in response to the sensitivity exhibited towards randomly initialized variables during distillation. We also propose two enhancement strategies, namely intermediate matching loss and weight perturbation, to mitigate the potential occurrence of cumulative errors. We conduct extensive experiments on datasets of different scales, sizes, and resolutions. The results demonstrate that the proposed method significantly outperforms prior methods. + +
+
+
+
+
+ + ♻ ☆ Understanding plasticity in neural networks ICML 2023 + + +
+ Plasticity, the ability of a neural network to quickly change its predictions +in response to new information, is essential for the adaptability and +robustness of deep reinforcement learning systems. Deep neural networks are +known to lose plasticity over the course of training even in relatively simple +learning problems, but the mechanisms driving this phenomenon are still poorly +understood. This paper conducts a systematic empirical analysis into plasticity +loss, with the goal of understanding the phenomenon mechanistically in order to +guide the future development of targeted solutions. We find that loss of +plasticity is deeply connected to changes in the curvature of the loss +landscape, but that it often occurs in the absence of saturated units. Based on +this insight, we identify a number of parameterization and optimization design +choices which enable networks to better preserve plasticity over the course of +training. We validate the utility of these findings on larger-scale RL +benchmarks in the Arcade Learning Environment. + +
+
+ comment: Accepted to ICML 2023 (oral presentation) +
+
+
+
+
+ + ♻ ☆ From Isolated Islands to Pangea: Unifying Semantic Space for Human + Action Understanding + + +
+ Action understanding is a vital step toward intelligent agents and has attracted long-term attention. It can be formulated as a mapping from the physical action space to the semantic space. Typically, researchers built action datasets according to idiosyncratic choices to define classes and push the envelope of benchmarks respectively. Thus, datasets are incompatible with each other like "Isolated Islands" due to semantic gaps and various class granularities, e.g., do housework in dataset A and wash plate in dataset B. We argue that a more principled semantic space is urgently needed to concentrate community efforts and enable us to use all datasets together to pursue generalizable action learning. To this end, we design a structured action semantic space based on a verb taxonomy hierarchy, covering massive actions. By aligning the classes of previous datasets to our semantic space, we gather (image/video/skeleton/MoCap) datasets into a unified database under a unified label system, i.e., bridging "isolated islands" into a "Pangea". Accordingly, we propose a novel model mapping from the physical space to the semantic space to fully use Pangea. In extensive experiments, our new system shows significant superiority, especially in transfer learning. Code and data will be made publicly available. + +
+
+ comment: Project Webpage: https://mvig-rhos.com/pangea +
+
+
+
+
+ + ♻ ☆ Bayesian Flow Networks + + +
+ This paper introduces Bayesian Flow Networks (BFNs), a new class of +generative model in which the parameters of a set of independent distributions +are modified with Bayesian inference in the light of noisy data samples, then +passed as input to a neural network that outputs a second, interdependent +distribution. Starting from a simple prior and iteratively updating the two +distributions yields a generative procedure similar to the reverse process of +diffusion models; however it is conceptually simpler in that no forward process +is required. Discrete and continuous-time loss functions are derived for +continuous, discretised and discrete data, along with sample generation +procedures. Notably, the network inputs for discrete data lie on the +probability simplex, and are therefore natively differentiable, paving the way +for gradient-based sample guidance and few-step generation in discrete domains +such as language modelling. The loss function directly optimises data +compression and places no restrictions on the network architecture. In our +experiments BFNs achieve competitive log-likelihoods for image modelling on +dynamically binarized MNIST and CIFAR-10, and outperform all known discrete +diffusion models on the text8 character-level language modelling task. + +
+
+
+
+
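To make the generative loop concrete, here is a schematic sketch (not the paper's implementation) of the continuous-data case: the parameters of an independent Gaussian input distribution are updated by a standard conjugate Bayesian rule after observing a noisy sample, and a neural network then maps those parameters toward an output distribution. The fixed accuracy `alpha`, the two-layer network, and the omitted loss and schedules are illustrative assumptions.

```python
import torch
import torch.nn as nn

torch.manual_seed(0)
D = 8                        # data dimensionality (illustrative)
net = nn.Sequential(nn.Linear(D, 64), nn.ReLU(), nn.Linear(64, D))  # maps input-distribution means to output means

x = torch.randn(D)                         # a data sample
mu, rho = torch.zeros(D), torch.ones(D)    # prior mean and precision of the independent input distribution

for step in range(10):
    alpha = 0.5                                  # per-step "accuracy" (noise precision); a schedule in practice
    y = x + torch.randn(D) / alpha**0.5          # noisy observation of the data (sender sample)
    # Conjugate Gaussian update of the input distribution in the light of the noisy sample:
    mu = (rho * mu + alpha * y) / (rho + alpha)
    rho = rho + alpha
    out_mean = net(mu)                           # the network produces the second, interdependent distribution
    # A training loss comparing out_mean against x would be accumulated here (omitted).
```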
+ + ♻ ☆ Asymptotic Bounds for Smoothness Parameter Estimates in Gaussian Process + Interpolation + + +
+ It is common to model a deterministic response function, such as the output +of a computer experiment, as a Gaussian process with a Mat\'ern covariance +kernel. The smoothness parameter of a Mat\'ern kernel determines many important +properties of the model in the large data limit, including the rate of +convergence of the conditional mean to the response function. We prove that the +maximum likelihood estimate of the smoothness parameter cannot asymptotically +undersmooth the truth when the data are obtained on a fixed bounded subset of +$\mathbb{R}^d$. That is, if the data-generating response function has Sobolev +smoothness $\nu_0 > d/2$, then the smoothness parameter estimate cannot be +asymptotically less than $\nu_0$. The lower bound is sharp. Additionally, we +show that maximum likelihood estimation recovers the true smoothness for a +class of compactly supported self-similar functions. For cross-validation we +prove an asymptotic lower bound $\nu_0 - d/2$, which however is unlikely to be +sharp. The results are based on approximation theory in Sobolev spaces and some +general theorems that restrict the set of values that the parameter estimators +can take. + +
+
+
+
+
+ + ♻ ☆ Dimensionality Reduction and Wasserstein Stability for Kernel Regression + + +
+ In a high-dimensional regression framework, we study consequences of the +naive two-step procedure where first the dimension of the input variables is +reduced and second, the reduced input variables are used to predict the output +variable with kernel regression. In order to analyze the resulting regression +errors, a novel stability result for kernel regression with respect to the +Wasserstein distance is derived. This allows us to bound errors that occur when +perturbed input data is used to fit the regression function. We apply the +general stability result to principal component analysis (PCA). Exploiting +known estimates from the literature on both principal component analysis and +kernel regression, we deduce convergence rates for the two-step procedure. The +latter turns out to be particularly useful in a semi-supervised setting. + +
+
+ comment: Forthcoming in JMLR +
+
+
+
+
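The two-step procedure analysed in this work is easy to state in code: first reduce the inputs (e.g., with PCA), then run kernel regression on the reduced representation. The sketch below uses scikit-learn with an RBF kernel and made-up dimensions; the paper's contribution is the statistical analysis of this kind of pipeline, not a specific implementation.

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline

rng = np.random.default_rng(0)
n, d, k = 500, 50, 5                      # samples, ambient dimension, reduced dimension (illustrative)
X = rng.normal(size=(n, d))
y = np.sin(X[:, 0]) + 0.1 * rng.normal(size=n)   # response driven by a low-dimensional direction

# Step 1: dimensionality reduction; Step 2: kernel regression on the reduced inputs.
two_step = make_pipeline(PCA(n_components=k), KernelRidge(kernel="rbf", alpha=1e-2, gamma=0.5))
two_step.fit(X, y)
print("train MSE:", np.mean((two_step.predict(X) - y) ** 2))
```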
+ + ♻ ☆ The Chosen One: Consistent Characters in Text-to-Image Diffusion Models + + +
+ Recent advances in text-to-image generation models have unlocked vast +potential for visual creativity. However, these models struggle with generation +of consistent characters, a crucial aspect for numerous real-world applications +such as story visualization, game development asset design, advertising, and +more. Current methods typically rely on multiple pre-existing images of the +target character or involve labor-intensive manual processes. In this work, we +propose a fully automated solution for consistent character generation, with +the sole input being a text prompt. We introduce an iterative procedure that, +at each stage, identifies a coherent set of images sharing a similar identity +and extracts a more consistent identity from this set. Our quantitative +analysis demonstrates that our method strikes a better balance between prompt +alignment and identity consistency compared to the baseline methods, and these +findings are reinforced by a user study. To conclude, we showcase several +practical applications of our approach. Project page is available at +https://omriavrahami.com/the-chosen-one + +
+
+ comment: Project page is available at https://omriavrahami.com/the-chosen-one +
+
+
+
+
+ + ♻ ☆ TorchRL: A data-driven decision-making library for PyTorch + + +
+ PyTorch has ascended as a premier machine learning framework, yet it lacks a native and comprehensive library for decision and control tasks suitable for large development teams dealing with complex real-world data and environments. To address this issue, we propose TorchRL, a generalistic control library for PyTorch that provides well-integrated, yet standalone components. We introduce a new and flexible PyTorch primitive, the TensorDict, which facilitates streamlined algorithm development across the many branches of Reinforcement Learning (RL) and control. We provide a detailed description of the building blocks and an extensive overview of the library across domains and tasks. Finally, we experimentally demonstrate its reliability and flexibility and show comparative benchmarks to demonstrate its computational efficiency. TorchRL fosters long-term support and is open-sourced on GitHub for greater reproducibility and collaboration within the research community. + +
+
+
+
+
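For readers unfamiliar with the primitive, TensorDict is a dictionary-like container carrying named tensors that share a common batch size, which is what lets TorchRL components exchange data uniformly. The snippet below is a minimal usage sketch based on the publicly documented tensordict package; the field names and shapes are arbitrary.

```python
import torch
from tensordict import TensorDict

# A batch of 4 transitions, stored as named tensors that share the leading batch dimension.
td = TensorDict(
    {
        "observation": torch.randn(4, 3),
        "action": torch.randn(4, 2),
        "reward": torch.zeros(4, 1),
    },
    batch_size=[4],
)

td["done"] = torch.zeros(4, 1, dtype=torch.bool)   # new entries are added like dict items
print(td["observation"].shape)                     # torch.Size([4, 3])
print(td[0])                                       # indexing slices every entry at once
```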
+ + ♻ ☆ Energy Discrepancies: A Score-Independent Loss for Energy-Based Models NeurIPS 2023 + + +
+ Energy-based models are a simple yet powerful class of probabilistic models, +but their widespread adoption has been limited by the computational burden of +training them. We propose a novel loss function called Energy Discrepancy (ED) +which does not rely on the computation of scores or expensive Markov chain +Monte Carlo. We show that ED approaches the explicit score matching and +negative log-likelihood loss under different limits, effectively interpolating +between both. Consequently, minimum ED estimation overcomes the problem of +nearsightedness encountered in score-based estimation methods, while also +enjoying theoretical guarantees. Through numerical experiments, we demonstrate +that ED learns low-dimensional data distributions faster and more accurately +than explicit score matching or contrastive divergence. For high-dimensional +image data, we describe how the manifold hypothesis puts limitations on our +approach and demonstrate the effectiveness of energy discrepancy by training +the energy-based model as a prior of a variational decoder model. + +
+
+ comment: Camera Ready version for the 37th Conference on Neural Information + Processing Systems (NeurIPS 2023). Changes in this revision: Appendix A1: + Corrected proof of Theorem 1. Appendix D3: Added definition and numerical + experiments for energy discrepancy on binary discrete spaces. Minor changes + in the main text and correction of typos. Added new references +
+
+
+
+
+ + ♻ ☆ Technical Report: Large Language Models can Strategically Deceive their + Users when Put Under Pressure + + +
+ We demonstrate a situation in which Large Language Models, trained to be +helpful, harmless, and honest, can display misaligned behavior and +strategically deceive their users about this behavior without being instructed +to do so. Concretely, we deploy GPT-4 as an agent in a realistic, simulated +environment, where it assumes the role of an autonomous stock trading agent. +Within this environment, the model obtains an insider tip about a lucrative +stock trade and acts upon it despite knowing that insider trading is +disapproved of by company management. When reporting to its manager, the model +consistently hides the genuine reasons behind its trading decision. We perform +a brief investigation of how this behavior varies under changes to the setting, +such as removing model access to a reasoning scratchpad, attempting to prevent +the misaligned behavior by changing system instructions, changing the amount of +pressure the model is under, varying the perceived risk of getting caught, and +making other simple changes to the environment. To our knowledge, this is the +first demonstration of Large Language Models trained to be helpful, harmless, +and honest, strategically deceiving their users in a realistic situation +without direct instructions or training for deception. + +
+
+
+
+
+ + ♻ ☆ Assessing Deep Neural Networks as Probability Estimators + + +
+ Deep Neural Networks (DNNs) have performed admirably in classification tasks. However, the characterization of their classification uncertainties, required for certain applications, has been lacking. In this work, we investigate the issue by assessing DNNs' ability to estimate conditional probabilities and propose a framework for systematic uncertainty characterization. Denoting the input sample as x and the category as y, the classification task of assigning a category y to a given input x can be reduced to the task of estimating the conditional probabilities p(y|x), as approximated by the DNN at its last layer using the softmax function. Since softmax yields a vector whose elements all fall in the interval (0, 1) and sum to 1, it suggests a probabilistic interpretation of the DNN's outcome. Using synthetic and real-world datasets, we look into the impact of various factors, e.g., probability density f(x) and inter-categorical sparsity, on the precision of DNNs' estimations of p(y|x), and find that the likelihood probability density and the inter-categorical sparsity have a greater impact on DNNs' classification uncertainty than the prior probability. + +
+
+ comment: Y. Pan, K. Kuo, M. Rilee and H. Yu, "Assessing Deep Neural Networks + as Probability Estimators," in 2021 IEEE International Conference on Big Data + (Big Data), Orlando, FL, USA, 2021 pp. 1083-1091. doi: + 10.1109/BigData52589.2021.9671328 +
+
+
+
+
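The question the paper asks can be reproduced on a toy problem where the true conditional probability is known in closed form: draw x from two overlapping Gaussians so that p(y=1|x) follows a logistic curve, train a small classifier, and compare its softmax output to the analytic p(y|x). The network size, data size, and training schedule below are arbitrary illustrative choices, not the paper's experimental setup.

```python
import numpy as np
import torch
import torch.nn as nn

rng = np.random.default_rng(0)
n = 5000
y = rng.integers(0, 2, size=n)
x = rng.normal(loc=2.0 * y - 1.0, scale=1.0)          # class 0 ~ N(-1,1), class 1 ~ N(+1,1)
true_p1 = 1.0 / (1.0 + np.exp(-2.0 * x))              # analytic p(y=1|x) for equal class priors

X = torch.tensor(x, dtype=torch.float32).unsqueeze(1)
Y = torch.tensor(y, dtype=torch.long)
net = nn.Sequential(nn.Linear(1, 32), nn.ReLU(), nn.Linear(32, 2))
opt = torch.optim.Adam(net.parameters(), lr=1e-2)
for _ in range(300):
    opt.zero_grad()
    loss = nn.functional.cross_entropy(net(X), Y)
    loss.backward()
    opt.step()

with torch.no_grad():
    est_p1 = torch.softmax(net(X), dim=1)[:, 1].numpy()   # softmax read as an estimate of p(y=1|x)
print("mean |softmax - true p(y=1|x)|:", np.abs(est_p1 - true_p1).mean())
```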
+ + ♻ ☆ CALICO: Self-Supervised Camera-LiDAR Contrastive Pre-training for BEV + Perception + + +
+ Perception is crucial in the realm of autonomous driving systems, where +bird's eye view (BEV)-based architectures have recently reached +state-of-the-art performance. The desirability of self-supervised +representation learning stems from the expensive and laborious process of +annotating 2D and 3D data. Although previous research has investigated +pretraining methods for both LiDAR and camera-based 3D object detection, a +unified pretraining framework for multimodal BEV perception is missing. In this +study, we introduce CALICO, a novel framework that applies contrastive +objectives to both LiDAR and camera backbones. Specifically, CALICO +incorporates two stages: point-region contrast (PRC) and region-aware +distillation (RAD). PRC better balances the region- and scene-level +representation learning on the LiDAR modality and offers significant +performance improvement compared to existing methods. RAD effectively achieves +contrastive distillation on our self-trained teacher model. CALICO's efficacy +is substantiated by extensive evaluations on 3D object detection and BEV map +segmentation tasks, where it delivers significant performance improvements. +Notably, CALICO outperforms the baseline method by 10.5% and 8.6% on NDS and +mAP. Moreover, CALICO boosts the robustness of multimodal 3D object detection +against adversarial attacks and corruption. Additionally, our framework can be +tailored to different backbones and heads, positioning it as a promising +approach for multimodal BEV perception. + +
+
+
+
+
+ + ♻ ☆ RCT Rejection Sampling for Causal Estimation Evaluation + + +
+ Confounding is a significant obstacle to unbiased estimation of causal +effects from observational data. For settings with high-dimensional covariates +-- such as text data, genomics, or the behavioral social sciences -- +researchers have proposed methods to adjust for confounding by adapting machine +learning methods to the goal of causal estimation. However, empirical +evaluation of these adjustment methods has been challenging and limited. In +this work, we build on a promising empirical evaluation strategy that +simplifies evaluation design and uses real data: subsampling randomized +controlled trials (RCTs) to create confounded observational datasets while +using the average causal effects from the RCTs as ground-truth. We contribute a +new sampling algorithm, which we call RCT rejection sampling, and provide +theoretical guarantees that causal identification holds in the observational +data to allow for valid comparisons to the ground-truth RCT. Using synthetic +data, we show our algorithm indeed results in low bias when oracle estimators +are evaluated on the confounded samples, which is not always the case for a +previously proposed algorithm. In addition to this identification result, we +highlight several finite data considerations for evaluation designers who plan +to use RCT rejection sampling on their own datasets. As a proof of concept, we +implement an example evaluation pipeline and walk through these finite data +considerations with a novel, real-world RCT -- which we release publicly -- +consisting of approximately 70k observations and text data as high-dimensional +covariates. Together, these contributions build towards a broader agenda of +improved empirical evaluation for causal estimation. + +
+
+ comment: Code and data at https://github.com/kakeith/rct_rejection_sampling +
+
+
+
+
+ + ♻ ☆ Long-Range Neural Atom Learning for Molecular Graphs + + +
+ Graph Neural Networks (GNNs) have been widely adopted for drug discovery with +molecular graphs. Nevertheless, current GNNs are mainly good at leveraging +short-range interactions (SRI) but struggle to capture long-range interactions +(LRI), both of which are crucial for determining molecular properties. To +tackle this issue, we propose a method that implicitly projects all original +atoms into a few Neural Atoms, which abstracts the collective information of +atomic groups within a molecule. Specifically, we explicitly exchange the +information among neural atoms and project them back to the atoms' +representations as an enhancement. With this mechanism, neural atoms establish +the communication channels among distant nodes, effectively reducing the +interaction scope of arbitrary node pairs into a single hop. To provide an +inspection of our method from a physical perspective, we reveal its connection +with the traditional LRI calculation method, Ewald Summation. We conduct +extensive experiments on three long-range graph benchmarks, covering both +graph-level and link-level tasks on molecular graphs. We empirically justify +that our method can be equipped with an arbitrary GNN and help to capture LRI. + +
+
+
+
+
+ + ♻ ☆ AdaptGuard: Defending Against Universal Attacks for Model Adaptation ICCV2023 + + +
+ Model adaptation aims at solving the domain transfer problem under the +constraint of only accessing the pretrained source models. With the increasing +considerations of data privacy and transmission efficiency, this paradigm has +been gaining recent popularity. This paper studies the vulnerability to +universal attacks transferred from the source domain during model adaptation +algorithms due to the existence of malicious providers. We explore both +universal adversarial perturbations and backdoor attacks as loopholes on the +source side and discover that they still survive in the target models after +adaptation. To address this issue, we propose a model preprocessing framework, +named AdaptGuard, to improve the security of model adaptation algorithms. +AdaptGuard avoids direct use of the risky source parameters through knowledge +distillation and utilizes the pseudo adversarial samples under adjusted radius +to enhance the robustness. AdaptGuard is a plug-and-play module that requires +neither robust pretrained models nor any changes for the following model +adaptation algorithms. Extensive results on three commonly used datasets and +two popular adaptation methods validate that AdaptGuard can effectively defend +against universal attacks and maintain clean accuracy in the target domain +simultaneously. We hope this research will shed light on the safety and +robustness of transfer learning. Code is available at +https://github.com/TomSheng21/AdaptGuard. + +
+
+ comment: ICCV2023 +
+
+
+
+
+ + ♻ ☆ The Lipschitz-Variance-Margin Tradeoff for Enhanced Randomized Smoothing + + +
+ Real-life applications of deep neural networks are hindered by their unsteady predictions when faced with noisy inputs and adversarial attacks. The certified radius is, in this context, a crucial indicator of the robustness of models. How, then, can one design an efficient classifier with a sufficiently large certified radius? Randomized smoothing provides a promising framework by relying on noise injection into inputs to obtain a smoothed and more robust classifier. In this paper, we first show that the variance introduced by randomized smoothing closely interacts with two other important properties of the classifier, \textit{i.e.} its Lipschitz constant and margin. More precisely, our work emphasizes the dual impact of the Lipschitz constant of the base classifier on both the smoothed classifier and the empirical variance. Moreover, to increase the certified robust radius, we introduce a different simplex projection technique for the base classifier to leverage the variance-margin trade-off thanks to Bernstein's concentration inequality, along with an enhanced Lipschitz bound. Experimental results show a significant improvement in certified accuracy compared to current state-of-the-art methods. Our novel certification procedure allows us to combine pre-trained models with randomized smoothing, effectively improving the current certification radius in a zero-shot manner. + +
+
+
+
+
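For context, the standard randomized-smoothing certificate (Cohen et al., 2019) that this line of work builds on can be sketched in a few lines: classify many Gaussian-noised copies of the input, take the majority class, and convert the estimated top-class probability into a certified L2 radius. The simplex-projection and Lipschitz-based improvements proposed in the paper are not reproduced here; the classifier `f` and the parameters are placeholders.

```python
import torch
from scipy.stats import norm

def smoothed_predict_and_radius(f, x, sigma=0.25, n=1000, num_classes=10):
    """Monte-Carlo randomized smoothing: majority vote over noisy copies of x,
    plus the classic certified radius sigma * Phi^{-1}(p_top). For brevity this
    uses the empirical p_top; a real certificate would lower-bound it with a
    confidence interval."""
    with torch.no_grad():
        noise = torch.randn(n, *x.shape) * sigma
        preds = f(x.unsqueeze(0) + noise).argmax(dim=1)        # f: batch of inputs -> logits
    counts = torch.bincount(preds, minlength=num_classes)
    top = counts.argmax().item()
    p_top = min(counts[top].item() / n, 1.0 - 1e-6)            # avoid an infinite radius at p_top = 1
    radius = sigma * norm.ppf(p_top) if p_top > 0.5 else 0.0
    return top, radius
```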
+ + ♻ ☆ Emerging Trends in Federated Learning: From Model Fusion to Federated X + Learning + + +
+ Federated learning is a new learning paradigm that decouples data collection +and model training via multi-party computation and model aggregation. As a +flexible learning setting, federated learning has the potential to integrate +with other learning frameworks. We conduct a focused survey of federated +learning in conjunction with other learning algorithms. Specifically, we +explore various learning algorithms to improve the vanilla federated averaging +algorithm and review model fusion methods such as adaptive aggregation, +regularization, clustered methods, and Bayesian methods. Following the emerging +trends, we also discuss federated learning in the intersection with other +learning paradigms, termed federated X learning, where X includes multitask +learning, meta-learning, transfer learning, unsupervised learning, and +reinforcement learning. This survey reviews the state of the art, challenges, +and future directions. + +
+
+
+
+
+ + ♻ ☆ Label Differential Privacy via Aggregation + + +
+ In many real-world applications, due to recent developments in the privacy landscape, training data may be aggregated to preserve the privacy of sensitive training labels. In the learning from label proportions (LLP) framework, the dataset is partitioned into bags of feature-vectors which are available only with the sum of the labels per bag. A further restriction, which we call learning from bag aggregates (LBA), is where, instead of individual feature-vectors, only the (possibly weighted) sum of the feature-vectors per bag is available. We study whether such aggregation techniques can provide privacy guarantees under the notion of label differential privacy (label-DP) previously studied in, e.g., [Chaudhuri-Hsu'11, Ghazi et al.'21, Esfandiari et al.'22]. It is easily seen that naive LBA and LLP do not provide label-DP. Our main result, however, shows that weighted LBA using iid Gaussian weights with $m$ randomly sampled disjoint $k$-sized bags is in fact $(\varepsilon, \delta)$-label-DP for any $\varepsilon > 0$ with $\delta \approx \exp(-\Omega(\sqrt{k}))$ assuming a lower bound on the linear-mse regression loss. Further, the $\ell_2^2$-regressor which minimizes the loss on the aggregated dataset has a loss within a $\left(1 + o(1)\right)$-factor of the optimum on the original dataset w.p. $\approx 1 - \exp(-\Omega(m))$. We emphasize that no additive label noise is required. The analogous weighted-LLP, however, does not admit label-DP. Nevertheless, we show that if additive $N(0, 1)$ noise can be added to any constant fraction of the instance labels, then the noisy weighted-LLP admits similar label-DP guarantees without assumptions on the dataset, while preserving the utility of Lipschitz-bounded neural mse-regression tasks. Our work is the first to demonstrate that label-DP can be achieved by randomly weighted aggregation for regression tasks, using no or little additive noise. + +
+
+
+
+
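The aggregation mechanism studied here can be sketched directly: partition examples into disjoint bags of size k, aggregate each bag's feature-vectors and labels with the same iid Gaussian weights (weighted LBA), and fit a least-squares regressor on the aggregated data only. This is an illustration of the scheme with toy dimensions; the paper's privacy and utility guarantees rely on conditions not enforced in this sketch.

```python
import numpy as np

rng = np.random.default_rng(0)
n, d, k = 10_000, 20, 50                     # examples, features, bag size (illustrative)
m = n // k                                   # number of bags
X = rng.normal(size=(n, d))
beta_true = rng.normal(size=d)
y = X @ beta_true + 0.1 * rng.normal(size=n)

# Weighted "learning from bag aggregates": per-bag Gaussian-weighted sums of features and labels.
perm = rng.permutation(n)
X_agg = np.zeros((m, d))
y_agg = np.zeros(m)
for b in range(m):
    idx = perm[b * k:(b + 1) * k]            # randomly sampled disjoint bag
    w = rng.normal(size=k)                   # iid N(0, 1) weights, shared by features and labels
    X_agg[b] = w @ X[idx]
    y_agg[b] = w @ y[idx]

# Regressor trained on the aggregates only; no individual labels are ever touched.
beta_hat = np.linalg.lstsq(X_agg, y_agg, rcond=None)[0]
print("parameter error:", np.linalg.norm(beta_hat - beta_true))
```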
+ + ♻ ☆ SLMIA-SR: Speaker-Level Membership Inference Attacks against Speaker + Recognition Systems NDSS + + +
+ Membership inference attacks allow adversaries to determine whether a +particular example was contained in the model's training dataset. While +previous works have confirmed the feasibility of such attacks in various +applications, none has focused on speaker recognition (SR), a promising +voice-based biometric recognition technique. In this work, we propose SLMIA-SR, +the first membership inference attack tailored to SR. In contrast to +conventional example-level attack, our attack features speaker-level membership +inference, i.e., determining if any voices of a given speaker, either the same +as or different from the given inference voices, have been involved in the +training of a model. It is particularly useful and practical since the training +and inference voices are usually distinct, and it is also meaningful +considering the open-set nature of SR, namely, the recognition speakers were +often not present in the training data. We utilize intra-similarity and +inter-dissimilarity, two training objectives of SR, to characterize the +differences between training and non-training speakers and quantify them with +two groups of features driven by carefully-established feature engineering to +mount the attack. To improve the generalizability of our attack, we propose a +novel mixing ratio training strategy to train attack models. To enhance the +attack performance, we introduce voice chunk splitting to cope with the limited +number of inference voices and propose to train attack models dependent on the +number of inference voices. Our attack is versatile and can work in both +white-box and black-box scenarios. Additionally, we propose two novel +techniques to reduce the number of black-box queries while maintaining the +attack performance. Extensive experiments demonstrate the effectiveness of +SLMIA-SR. + +
+
+ comment: In Proceedings of the 31st Network and Distributed System Security + (NDSS) Symposium, 2024 +
+
+
+
+
+ + ♻ ☆ The Map Equation Goes Neural + + +
+ Community detection and graph clustering are essential for unsupervised data +exploration and understanding the high-level organisation of networked systems. +Recently, graph clustering has received attention as a primary task for graph +neural networks. Although hierarchical graph pooling has been shown to improve +performance in graph and node classification tasks, it performs poorly in +identifying meaningful clusters. Community detection has a long history in +network science, but typically relies on optimising objective functions with +custom-tailored search algorithms, not leveraging recent advances in deep +learning, particularly from graph neural networks. In this paper, we narrow +this gap between the deep learning and network science communities. We consider +the map equation, an information-theoretic objective function for unsupervised +community detection. Expressing it in a fully differentiable tensor form that +produces soft cluster assignments, we optimise the map equation with deep +learning through gradient descent. More specifically, the reformulated map +equation is a loss function compatible with any graph neural network +architecture, enabling flexible clustering and graph pooling that clusters both +graph structure and data features in an end-to-end way, automatically finding +an optimum number of clusters without explicit regularisation by following the +minimum description length principle. We evaluate our approach experimentally +using different neural network architectures for unsupervised clustering in +synthetic and real data. Our results show that our approach achieves +competitive performance against baselines, naturally detects overlapping +communities, and avoids over-partitioning sparse graphs. + +
+
+
+
+
+ + ♻ ☆ Aggregating Capacity in FL through Successive Layer Training for + Computationally-Constrained Devices NeurIPS'23 + + +
+ Federated learning (FL) is usually performed on resource-constrained edge devices, e.g., with limited memory for the computation. If the memory required to train a model exceeds this limit, the device will be excluded from the training. This can lead to lower accuracy as valuable data and computation resources are excluded from training, also causing bias and unfairness. The FL training process should be adjusted to such constraints. The state-of-the-art techniques propose training subsets of the FL model at constrained devices, reducing their resource requirements for training. But these techniques largely limit the co-adaptation among parameters of the model and are highly inefficient, as we show: it is actually better for the system to train a smaller (less accurate) model that all devices can train end-to-end than to apply such techniques. We propose a new method that enables successive freezing and training of the parameters of the FL model at devices, reducing the training's resource requirements at the devices, while still allowing enough co-adaptation between parameters. We show through extensive experimental evaluation that our technique greatly improves the accuracy of the trained model (by 52.4 p.p.) compared with the state of the art, efficiently aggregating the computation capacity available on distributed devices. + +
+
+ comment: accepted at NeurIPS'23 +
+
+
+
+
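The core idea of successively freezing and training parts of the model, so that a constrained device only holds gradients for a fraction of the parameters at any time, can be illustrated with plain PyTorch by toggling requires_grad per block across training rounds. This sketch ignores the federated aggregation, scheduling, and memory accounting that the paper actually contributes; the model, data, and round structure are placeholders.

```python
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(32, 64), nn.ReLU(),
                      nn.Linear(64, 64), nn.ReLU(),
                      nn.Linear(64, 10))
blocks = [model[0], model[2], model[4]]          # blocks trained, then frozen, in succession

def set_trainable(active_idx):
    """Freeze every block except the one currently being trained."""
    for i, block in enumerate(blocks):
        for p in block.parameters():
            p.requires_grad = (i == active_idx)

x, y = torch.randn(16, 32), torch.randint(0, 10, (16,))
for round_idx in range(6):
    active = min(round_idx // 2, len(blocks) - 1)   # move the training "window" forward every 2 rounds
    set_trainable(active)
    opt = torch.optim.SGD([p for p in model.parameters() if p.requires_grad], lr=0.1)
    opt.zero_grad()
    nn.functional.cross_entropy(model(x), y).backward()   # frozen layers produce no parameter gradients
    opt.step()
```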
+ + ♻ ☆ An efficient likelihood-free Bayesian inference method based on + sequential neural posterior estimation + + +
+ Sequential neural posterior estimation (SNPE) techniques have been recently +proposed for dealing with simulation-based models with intractable likelihoods. +Unlike approximate Bayesian computation, SNPE techniques learn the posterior +from sequential simulation using neural network-based conditional density +estimators by minimizing a specific loss function. The SNPE method proposed by +Lueckmann et al. (2017) used a calibration kernel to boost the sample weights +around the observed data, resulting in a concentrated loss function. However, +the use of calibration kernels may increase the variances of both the empirical +loss and its gradient, making the training inefficient. To improve the +stability of SNPE, this paper proposes to use an adaptive calibration kernel +and several variance reduction techniques. The proposed method greatly speeds +up the process of training, and provides a better approximation of the +posterior than the original SNPE method and some existing competitors as +confirmed by numerical experiments. + +
+
+ comment: 30 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Empirical Study of PEFT techniques for Winter Wheat Segmentation + + +
+ Parameter Efficient Fine Tuning (PEFT) techniques have recently experienced significant growth and have been extensively employed to adapt large vision and language models to various domains, enabling satisfactory model performance with minimal computational needs. Despite these advances, research has yet to fully explore potential PEFT applications in real-life scenarios, particularly in the critical domains of remote sensing and crop monitoring. The diversity of climates across different regions and the need for comprehensive large-scale datasets have posed significant obstacles to accurately identifying crop types across varying geographic locations and changing growing seasons. This study seeks to bridge this gap by comprehensively exploring the feasibility of cross-area and cross-year out-of-distribution generalization using the State-of-the-Art (SOTA) wheat crop monitoring model. The aim of this work is to explore PEFT approaches for crop monitoring. Specifically, we focus on adapting the SOTA TSViT model to address winter wheat field segmentation, a critical task for crop monitoring and food security. This adaptation process involves integrating different PEFT techniques, including BitFit, LoRA, Adaptformer, and prompt tuning. Using PEFT techniques, we achieved notable results comparable to those achieved using full fine-tuning methods while training a mere 0.7% of the parameters of the whole TSViT architecture. The in-house labeled dataset, referred to as the Beqaa-Lebanon dataset, comprises high-quality annotated polygons for wheat and non-wheat classes with a total surface of 170 km$^2$, over five consecutive years. Using Sentinel-2 images, our model achieved an 84% F1-score. We intend to publicly release the Lebanese winter wheat dataset, code repository, and model weights. + +
+
+
+
+
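Since TSViT and the Beqaa-Lebanon dataset are not reproduced here, the snippet below only illustrates how a PEFT adapter such as LoRA is attached to a generic vision transformer with the Hugging Face peft library, one of the technique families the study compares. The base checkpoint, target module names, and rank are placeholder assumptions rather than the paper's configuration.

```python
from transformers import ViTForImageClassification
from peft import LoraConfig, get_peft_model

# Placeholder base model; the paper adapts a SOTA TSViT crop-monitoring model instead.
base = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224", num_labels=2, ignore_mismatched_sizes=True
)

lora_cfg = LoraConfig(
    r=8,                                 # low-rank dimension (illustrative)
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["query", "value"],   # attention projections in the HF ViT implementation
)
model = get_peft_model(base, lora_cfg)
model.print_trainable_parameters()       # only a small fraction of the weights is trainable
```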
+ + ♻ ☆ Computational and Storage Efficient Quadratic Neurons for Deep Neural + Networks DATE + + +
+ Deep neural networks (DNNs) have been widely deployed across diverse domains +such as computer vision and natural language processing. However, the +impressive accomplishments of DNNs have been realized alongside extensive +computational demands, thereby impeding their applicability on +resource-constrained devices. To address this challenge, many researchers have +been focusing on basic neuron structures, the fundamental building blocks of +neural networks, to alleviate the computational and storage cost. In this work, +an efficient quadratic neuron architecture distinguished by its enhanced +utilization of second-order computational information is introduced. By virtue +of their better expressivity, DNNs employing the proposed quadratic neurons can +attain similar accuracy with fewer neurons and computational cost. Experimental +results have demonstrated that the proposed quadratic neuron structure exhibits +superior computational and storage efficiency across various tasks when +compared with both linear and non-linear neurons in prior work. + +
+
+ comment: Accepted by Design Automation and Test in Europe (DATE) 2024 +
+
+
+
+
+ + ♻ ☆ RankFeat&RankWeight: Rank-1 Feature/Weight Removal for + Out-of-distribution Detection + + +
+ The task of out-of-distribution (OOD) detection is crucial for deploying machine learning models in real-world settings. In this paper, we observe that the singular value distributions of the in-distribution (ID) and OOD features are quite different: the OOD feature matrix tends to have a larger dominant singular value than the ID feature, and the class predictions of OOD samples are largely determined by it. This observation motivates us to propose \texttt{RankFeat}, a simple yet effective \emph{post hoc} approach for OOD detection by removing the rank-1 matrix composed of the largest singular value and the associated singular vectors from the high-level feature. \texttt{RankFeat} achieves \emph{state-of-the-art} performance and reduces the average false positive rate (FPR95) by 17.90\% compared with the previous best method. The success of \texttt{RankFeat} motivates us to investigate whether a similar phenomenon would exist in the parameter matrices of neural networks. We thus propose \texttt{RankWeight}, which removes the rank-1 weight from the parameter matrices of a single deep layer. Our \texttt{RankWeight} is also \emph{post hoc} and only requires computing the rank-1 matrix once. As a standalone approach, \texttt{RankWeight} has very competitive performance against other methods across various backbones. Moreover, \texttt{RankWeight} enjoys flexible compatibility with a wide range of OOD detection methods. The combination of \texttt{RankWeight} and \texttt{RankFeat} sets a new \emph{state-of-the-art} performance, achieving an FPR95 as low as 16.13\% on the ImageNet-1k benchmark. Extensive ablation studies and comprehensive theoretical analyses are presented to support the empirical results. + +
+
+ comment: submitted to T-PAMI. arXiv admin note: substantial text overlap with + arXiv:2209.08590 +
+
+
+
+
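The core operation behind both variants is literally a rank-1 subtraction, which can be sketched as follows: RankFeat removes the top singular component from a (flattened) high-level feature map at inference time, and RankWeight does the same once to a layer's weight matrix. The shapes and the chosen layer are illustrative; the OOD scoring functions and thresholds from the paper are omitted.

```python
import torch

def remove_rank1(mat: torch.Tensor) -> torch.Tensor:
    """Subtract the rank-1 matrix s1 * u1 v1^T built from the largest singular value."""
    U, S, Vh = torch.linalg.svd(mat, full_matrices=False)
    return mat - S[0] * torch.outer(U[:, 0], Vh[0, :])

# RankFeat-style: applied per sample to a high-level feature matrix (tokens x channels).
feat = torch.randn(49, 2048)            # placeholder: 7x7 spatial grid flattened, 2048 channels
feat_cleaned = remove_rank1(feat)

# RankWeight-style: remove the rank-1 component of one deep layer's weights, once, post hoc.
layer = torch.nn.Linear(2048, 1000)
with torch.no_grad():
    layer.weight.copy_(remove_rank1(layer.weight))
```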
+ + ♻ ☆ Improved identification accuracy in equation learning via comprehensive + $\boldsymbol{R^2}$-elimination and Bayesian model selection + + +
+ In the field of equation learning, exhaustively considering all possible +equations derived from a basis function dictionary is infeasible. Sparse +regression and greedy algorithms have emerged as popular approaches to tackle +this challenge. However, the presence of multicollinearity poses difficulties +for sparse regression techniques, and greedy steps may inadvertently exclude +terms of the true equation, leading to reduced identification accuracy. In this +article, we present an approach that strikes a balance between +comprehensiveness and efficiency in equation learning. Inspired by stepwise +regression, our approach combines the coefficient of determination, $R^2$, and +the Bayesian model evidence, $p(\boldsymbol y|\mathcal M)$, in a novel way. Our +procedure is characterized by a comprehensive search with just a minor +reduction of the model space at each iteration step. With two flavors of our +approach and the adoption of $p(\boldsymbol y|\mathcal M)$ for bi-directional +stepwise regression, we present a total of three new avenues for equation +learning. Through three extensive numerical experiments involving random +polynomials and dynamical systems, we compare our approach against four +state-of-the-art methods and two standard approaches. The results demonstrate +that our comprehensive search approach surpasses all other methods in terms of +identification accuracy. In particular, the second flavor of our approach +establishes an efficient overfitting penalty solely based on $R^2$, which +achieves highest rates of exact equation recovery. + +
+
+ comment: 12 pages main text and 11 pages appendix, Published in TMLR + (https://openreview.net/forum?id=0ck7hJ8EVC) +
+
+
+
+
+ + ♻ ☆ Uncovering the Hidden Cost of Model Compression + + +
+ In the era of resource-intensive foundation models, efficient adaptation in +downstream tasks has become paramount. Visual Prompting (VP), inspired by +prompting in Large Language Models (LLMs), has emerged as a key transfer +learning method in computer vision. Aligned with the growing significance of +efficiency, research in model compression has become pivotal to alleviate the +computational burden in both training and deploying over-parameterized neural +networks. A key goal in model compression is the development of sparse models +capable of matching or surpassing the performance of their over-parameterized, +dense counterparts. While prior research has explored the impact of model +sparsity on transfer learning, its effects on visual prompting-based transfer +remain unclear. This study addresses this gap, revealing that model sparsity +adversely affects the performance of visual prompting-based transfer, +particularly in low-data-volume scenarios. Furthermore, our findings highlight +the negative influence of sparsity on the calibration of downstream +visual-prompted models. This empirical exploration calls for a nuanced +understanding beyond accuracy in sparse settings, opening avenues for further +research in Visual Prompting for sparse models. Code and logs can be accessed +at https://github.com/landskape-ai/Reprogram_LT . + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Artificial Neural Networks generated by Low Discrepancy Sequences + + +
+ Artificial neural networks can be represented by paths. Generated as random +walks on a dense network graph, we find that the resulting sparse networks +allow for deterministic initialization and even weights with fixed sign. Such +networks can be trained sparse from scratch, avoiding the expensive procedure +of training a dense network and compressing it afterwards. Although sparse, +weights are accessed as contiguous blocks of memory. In addition, enumerating +the paths using deterministic low discrepancy sequences, for example the Sobol' +sequence, amounts to connecting the layers of neural units by progressive +permutations, which naturally avoids bank conflicts in parallel computer +hardware. We demonstrate that the artificial neural networks generated by low +discrepancy sequences can achieve an accuracy within reach of their dense +counterparts at a much lower computational complexity. + +
+
+
+
+
+ + ♻ ☆ Efficient Gradient Estimation via Adaptive Sampling and Importance + Sampling + + +
+ Machine learning problems rely heavily on stochastic gradient descent (SGD) +for optimization. The effectiveness of SGD is contingent upon accurately +estimating gradients from a mini-batch of data samples. Instead of the commonly +used uniform sampling, adaptive or importance sampling reduces noise in +gradient estimation by forming mini-batches that prioritize crucial data +points. Previous research has suggested that data points should be selected +with probabilities proportional to their gradient norm. Nevertheless, existing +algorithms have struggled to efficiently integrate importance sampling into +machine learning frameworks. In this work, we make two contributions. First, we +present an algorithm that can incorporate existing importance functions into +our framework. Second, we propose a simplified importance function that relies +solely on the loss gradient of the output layer. By leveraging our proposed +gradient estimation techniques, we observe improved convergence in +classification and regression tasks with minimal computational overhead. We +validate the effectiveness of our adaptive and importance-sampling approach on +image and point-cloud datasets. + +
+
+ comment: 15 pages, 10 figures +
+
+
+
+
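The simplified importance function described, based only on the loss gradient at the output layer, can be sketched as follows: for a softmax/cross-entropy head that gradient is softmax(z) minus the one-hot label, so its norm is cheap to compute, and sampled examples are reweighted by the inverse of their selection probability to keep the mini-batch gradient unbiased. The model and batch sizes are placeholders, and the authors' full framework and adaptivity details are not reproduced.

```python
import torch
import torch.nn.functional as F

def importance_weighted_batch(model, X, Y, batch_size=64):
    """Pick a mini-batch with probability proportional to the per-sample output-layer
    gradient norm, and return importance weights that keep the gradient estimate unbiased."""
    with torch.no_grad():
        probs = F.softmax(model(X), dim=1)                       # (N, C)
        g = probs - F.one_hot(Y, probs.shape[1]).float()         # output-layer loss gradient per sample
        scores = g.norm(dim=1) + 1e-8
        p = scores / scores.sum()                                # sampling distribution over examples
    idx = torch.multinomial(p, batch_size, replacement=True)
    weights = 1.0 / (len(X) * p[idx])                            # unbiasedness correction
    return idx, weights

# Usage inside a training step (model, optimizer, X_train, Y_train assumed to exist):
# idx, w = importance_weighted_batch(model, X_train, Y_train)
# loss = (w * F.cross_entropy(model(X_train[idx]), Y_train[idx], reduction="none")).mean()
```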
+ + ♻ ☆ A New Type Of Upper And Lower Bounds On Right-Tail Probabilities Of + Continuous Random Variables + + +
+ In this paper, I present a completely new type of upper and lower bounds on +the right-tail probabilities of continuous random variables with unbounded +support and with semi-bounded support from the left. The presented upper and +lower right-tail bounds depend only on the probability density function (PDF), +its first derivative, and two parameters that are used for tightening the +bounds. These tail bounds hold under certain conditions that depend on the PDF, +its first and second derivatives, and the two parameters. The new tail bounds +are shown to be tight for a wide range of continuous random variables via +numerical examples. + +
+
+ comment: Minor typos corrected v2 +
+
+
+
+
+ + ♻ ☆ Unleashing the Power of Pre-trained Language Models for Offline + Reinforcement Learning + + +
+ Offline reinforcement learning (RL) aims to find a near-optimal policy using +pre-collected datasets. In real-world scenarios, data collection could be +costly and risky; therefore, offline RL becomes particularly challenging when +the in-domain data is limited. Given recent advances in Large Language Models +(LLMs) and their few-shot learning prowess, this paper introduces +$\textbf{La}$nguage Models for $\textbf{Mo}$tion Control ($\textbf{LaMo}$), a +general framework based on Decision Transformers to effectively use pre-trained +Language Models (LMs) for offline RL. Our framework highlights four crucial +components: (1) Initializing Decision Transformers with sequentially +pre-trained LMs, (2) employing the LoRA fine-tuning method, in contrast to +full-weight fine-tuning, to combine the pre-trained knowledge from LMs and +in-domain knowledge effectively, (3) using the non-linear MLP transformation +instead of linear projections, to generate embeddings, and (4) integrating an +auxiliary language prediction loss during fine-tuning to stabilize the LMs and +retain their original abilities on languages. Empirical results indicate +$\textbf{LaMo}$ achieves state-of-the-art performance in sparse-reward tasks +and closes the gap between value-based offline RL methods and decision +transformers in dense-reward tasks. In particular, our method demonstrates +superior performance in scenarios with limited data samples. + +
+
+ comment: 24 pages, 16 tables +
+
+
+
+
+ + ♻ ☆ A Comparison of PDF Projection with Normalizing Flows and SurVAE + + +
+ Normalizing flows (NF) recently gained attention as a way to construct +generative networks with exact likelihood calculation out of composable layers. +However, NF is restricted to dimension-preserving transformations. Surjection +VAE (SurVAE) has been proposed to extend NF to dimension-altering +transformations. Such networks are desirable because they are expressive and +can be precisely trained. We show that the approaches are a re-invention of PDF +projection, which appeared over twenty years earlier and is much further +developed. + +
+
+
+
+
+ + ♻ ☆ Token-Level Adversarial Prompt Detection Based on Perplexity Measures + and Contextual Information + + +
+ In recent years, Large Language Models (LLM) have emerged as pivotal tools in +various applications. However, these models are susceptible to adversarial +prompt attacks, where attackers can carefully curate input strings that lead to +undesirable outputs. The inherent vulnerability of LLMs stems from their +input-output mechanisms, especially when presented with intensely +out-of-distribution (OOD) inputs. This paper proposes a token-level detection +method to identify adversarial prompts, leveraging the LLM's capability to +predict the next token's probability. We measure the degree of the model's +perplexity and incorporate neighboring token information to encourage the +detection of contiguous adversarial prompt sequences. As a result, we propose +two methods: one that identifies each token as either being part of an +adversarial prompt or not, and another that estimates the probability of each +token being part of an adversarial prompt. + +
+
+
+
+
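A minimal version of the signal this method builds on can be computed with any causal language model: score each token's negative log-likelihood under the model and smooth it over a small window of neighbours, so that contiguous runs of high-surprisal tokens (as adversarial suffixes tend to be) stand out. The thresholding and probability estimation of the proposed methods are not reproduced; GPT-2 and the window size are illustrative choices.

```python
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tok = GPT2TokenizerFast.from_pretrained("gpt2")
lm = GPT2LMHeadModel.from_pretrained("gpt2").eval()

def token_surprisal(text, window=3):
    ids = tok(text, return_tensors="pt").input_ids
    with torch.no_grad():
        logits = lm(ids).logits
    # Negative log-likelihood of each token given its prefix (the first token has no prefix score).
    logp = torch.log_softmax(logits[0, :-1], dim=-1)
    nll = -logp.gather(1, ids[0, 1:].unsqueeze(1)).squeeze(1)
    # Average over neighbouring tokens so contiguous high-perplexity spans are emphasised.
    smoothed = torch.nn.functional.avg_pool1d(
        nll.view(1, 1, -1), kernel_size=window, stride=1, padding=window // 2
    ).view(-1)
    return list(zip(tok.convert_ids_to_tokens(ids[0, 1:]), smoothed.tolist()))

for token, score in token_surprisal("Please summarise this text } } ]; !! describing.--"):
    print(f"{token!r}: {score:.2f}")
```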
+ + ♻ ☆ The Open DAC 2023 Dataset and Challenges for Sorbent Discovery in Direct + Air Capture + + +
+ New methods for carbon dioxide removal are urgently needed to combat global +climate change. Direct air capture (DAC) is an emerging technology to capture +carbon dioxide directly from ambient air. Metal-organic frameworks (MOFs) have +been widely studied as potentially customizable adsorbents for DAC. However, +discovering promising MOF sorbents for DAC is challenging because of the vast +chemical space to explore and the need to understand materials as functions of +humidity and temperature. We explore a computational approach benefiting from +recent innovations in machine learning (ML) and present a dataset named Open +DAC 2023 (ODAC23) consisting of more than 38M density functional theory (DFT) +calculations on more than 8,400 MOF materials containing adsorbed $CO_2$ and/or +$H_2O$. ODAC23 is by far the largest dataset of MOF adsorption calculations at +the DFT level of accuracy currently available. In addition to probing +properties of adsorbed molecules, the dataset is a rich source of information +on structural relaxation of MOFs, which will be useful in many contexts beyond +specific applications for DAC. A large number of MOFs with promising properties +for DAC are identified directly in ODAC23. We also trained state-of-the-art ML +models on this dataset to approximate calculations at the DFT level. This +open-source dataset and our initial ML models will provide an important +baseline for future efforts to identify MOFs for a wide range of applications, +including DAC. + +
+
+
+
+
+ + ♻ ☆ TODM: Train Once Deploy Many Efficient Supernet-Based RNN-T Compression + For On-device ASR Models ICASSP 2024 + + +
+ Automatic Speech Recognition (ASR) models need to be optimized for specific +hardware before they can be deployed on devices. This can be done by tuning the +model's hyperparameters or exploring variations in its architecture. +Re-training and re-validating models after making these changes can be a +resource-intensive task. This paper presents TODM (Train Once Deploy Many), a +new approach to efficiently train many sizes of hardware-friendly on-device ASR +models with comparable GPU-hours to that of a single training job. TODM +leverages insights from prior work on Supernet, where Recurrent Neural Network +Transducer (RNN-T) models share weights within a Supernet. It reduces layer +sizes and widths of the Supernet to obtain subnetworks, making them smaller +models suitable for all hardware types. We introduce a novel combination of +three techniques to improve the outcomes of the TODM Supernet: adaptive +dropouts, an in-place Alpha-divergence knowledge distillation, and the use of +the ScaledAdam optimizer. We validate our approach by comparing Supernet-trained +versus individually tuned Multi-Head State Space Model (MH-SSM) RNN-T using +LibriSpeech. Results demonstrate that our TODM Supernet either matches or +surpasses the performance of manually tuned models by up to 3% relative in +word error rate (WER), while efficiently keeping the cost of training +many models at a small constant. + +
+
+ comment: Meta AI; Submitted to ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ StratMed: Relevance Stratification between Biomedical Entities for + Sparsity on Medication Recommendation + + +
+ With the growing imbalance between limited medical resources and escalating +demands, AI-based clinical tasks have become paramount. As a sub-domain, +medication recommendation aims to amalgamate longitudinal patient history with +medical knowledge, assisting physicians in prescribing safer and more accurate +medication combinations. Existing works ignore the inherent long-tailed +distribution of medical data, have uneven learning strengths for hot and sparse +data, and fail to balance safety and accuracy. To address the above +limitations, we propose StratMed, which introduces a stratification strategy +that overcomes the long-tailed problem and achieves fuller learning of sparse +data. It also utilizes a dual-property network to address the issue of mutual +constraints on the safety and accuracy of medication combinations, +synergistically enhancing these two properties. Specifically, we construct a +pre-training method using deep learning networks to obtain medication and +disease representations. After that, we design a pyramid-like stratification +method based on relevance to strengthen the expressiveness of sparse data. +Based on this relevance, we design two graph structures to express medication +safety and precision at the same level to obtain patient representations. +Finally, the patient's historical clinical information is fitted to generate +medication combinations for the current health condition. We employed the +MIMIC-III dataset to evaluate our model against state-of-the-art methods in +three aspects comprehensively. Compared to the sub-optimal baseline model, our +model reduces safety risk by 15.08\%, improves accuracy by 0.36\%, and reduces +training time consumption by 81.66\%. + +
+
+
+
+
+ + ♻ ☆ Auto-PINN: Understanding and Optimizing Physics-Informed Neural + Architecture + + +
+ Physics-informed neural networks (PINNs) are revolutionizing science and +engineering practice by bringing together the power of deep learning to bear on +scientific computation. In forward modeling problems, PINNs are meshless +partial differential equation (PDE) solvers that can handle irregular, +high-dimensional physical domains. Naturally, the neural architecture +hyperparameters have a large impact on the efficiency and accuracy of the PINN +solver. However, this remains an open and challenging problem because of the +large search space and the difficulty of identifying a proper search objective +for PDEs. Here, we propose Auto-PINN, the first systematic, automated +hyperparameter optimization approach for PINNs, which employs Neural +Architecture Search (NAS) techniques to PINN design. Auto-PINN avoids manually +or exhaustively searching the hyperparameter space associated with PINNs. A +comprehensive set of pre-experiments using standard PDE benchmarks allows us to +probe the structure-performance relationship in PINNs. We find that the +different hyperparameters can be decoupled, and that the training loss function +of PINNs is a good search objective. Comparison experiments with baseline +methods demonstrate that Auto-PINN produces neural architectures with superior +stability and accuracy over alternative baselines. + +
+
+
+
+
+ + ♻ ☆ FedSoL: Bridging Global Alignment and Local Generality in Federated + Learning + + +
+ Federated Learning (FL) aggregates locally trained models from individual +clients to construct a global model. While FL enables learning a model with +data privacy, it often suffers from significant performance degradation when +client data distributions are heterogeneous. Many previous FL algorithms have +addressed this issue by introducing various proximal restrictions. These +restrictions aim to encourage global alignment by constraining the deviation of +local learning from the global objective. However, they inherently limit local +learning by interfering with the original local objectives. Recently, an +alternative approach has emerged to improve local learning generality. By +obtaining local models within a smooth loss landscape, this approach mitigates +conflicts among different local objectives of the clients. Yet, it does not +ensure stable global alignment, as local learning does not take the global +objective into account. In this study, we propose Federated Stability on +Learning (FedSoL), which combines both the concepts of global alignment and +local generality. In FedSoL, the local learning seeks a parameter region robust +against proximal perturbations. This strategy introduces an implicit proximal +restriction effect in local learning while maintaining the original local +objective for parameter update. Our experiments show that FedSoL consistently +achieves state-of-the-art performance on various setups. + +
+
+ comment: 19 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ A review of ensemble learning and data augmentation models for class + imbalanced problems: combination, implementation and evaluation + + +
+ Class imbalance (CI) in classification problems arises when the number of +observations belonging to one class is lower than the other. Ensemble learning +combines multiple models to obtain a robust model and has been prominently used +with data augmentation methods to address class imbalance problems. In the last +decade, a number of strategies have been added to enhance ensemble learning and +data augmentation methods, along with new methods such as generative +adversarial networks (GANs). A combination of these has been applied in many +studies, and the evaluation of different combinations would enable a better +understanding and guidance for different application domains. In this paper, we +present a computational study to evaluate data augmentation and ensemble +learning methods used to address prominent benchmark CI problems. We present a +general framework that evaluates 9 data augmentation and 9 ensemble learning +methods for CI problems. Our objective is to identify the most effective +combination for improving classification performance on imbalanced datasets. +The results indicate that combinations of data augmentation methods with +ensemble learning can significantly improve classification performance on +imbalanced datasets. We find that traditional data augmentation methods such as +the synthetic minority oversampling technique (SMOTE) and random oversampling +(ROS) are not only better in performance for selected CI problems, but also +computationally less expensive than GANs. Our study is vital for the +development of novel models for handling imbalanced datasets. + +
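A minimal example of the kind of combination evaluated here, pairing a classical oversampler with an ensemble classifier via scikit-learn and imbalanced-learn, is sketched below; the toy data and hyperparameters are placeholders, not the paper's benchmarks.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, RandomOverSampler

# Toy imbalanced problem (roughly 95% / 5% class split).
X, y = make_classification(n_samples=5000, weights=[0.95, 0.05], random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=0)

for name, sampler in [("SMOTE", SMOTE(random_state=0)),
                      ("ROS", RandomOverSampler(random_state=0))]:
    X_res, y_res = sampler.fit_resample(X_tr, y_tr)                  # data augmentation
    clf = RandomForestClassifier(random_state=0).fit(X_res, y_res)   # ensemble learner
    print(name, f1_score(y_te, clf.predict(X_te)))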
+
+
+
+
+ + ♻ ☆ Redefining Super-Resolution: Fine-mesh PDE predictions without classical + simulations NeurIPS 2023 + + +
+ In Computational Fluid Dynamics (CFD), coarse mesh simulations offer +computational efficiency but often lack precision. Applying conventional +super-resolution to these simulations poses a significant challenge due to the +fundamental contrast between downsampling high-resolution images and +authentically emulating low-resolution physics. The former method conserves +more of the underlying physics, surpassing the usual constraints of real-world +scenarios. We propose a novel definition of super-resolution tailored for +PDE-based problems. Instead of simply downsampling from a high-resolution +dataset, we use coarse-grid simulated data as our input and predict fine-grid +simulated outcomes. Employing a physics-infused UNet upscaling method, we +demonstrate its efficacy across various 2D-CFD problems such as discontinuity +detection in Burger's equation, Methane combustion, and fouling in Industrial +heat exchangers. Our method enables the generation of fine-mesh solutions +bypassing traditional simulation, ensuring considerable computational saving +and fidelity to the original ground truth outcomes. Through diverse boundary +conditions during training, we further establish the robustness of our method, +paving the way for its broad applications in engineering and scientific CFD +solvers. + +
+
+ comment: Accepted at Machine Learning and the Physical Sciences Workshop, + NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Directional Privacy for Deep Learning + + +
+ Differentially Private Stochastic Gradient Descent (DP-SGD) is a key method +for applying privacy in the training of deep learning models. It applies +isotropic Gaussian noise to gradients during training, which can perturb these +gradients in any direction, damaging utility. Metric DP, however, can provide +alternative mechanisms based on arbitrary metrics that might be more suitable +for preserving utility. In this paper, we apply \textit{directional privacy}, +via a mechanism based on the von Mises-Fisher (VMF) distribution, to perturb +gradients in terms of \textit{angular distance} so that gradient direction is +broadly preserved. We show that this provides both $\epsilon$-DP and $\epsilon +d$-privacy for deep learning training, rather than the $(\epsilon, +\delta)$-privacy of the Gaussian mechanism. Experiments on key datasets then +indicate that the VMF mechanism can outperform the Gaussian in the +utility-privacy trade-off. In particular, our experiments provide a direct +empirical comparison of privacy between the two approaches in terms of their +ability to defend against reconstruction and membership inference. + +
+
+
+
+
+ + ♻ ☆ Big Data Analytics for Network Level Short-Term Travel Time Prediction + with Hierarchical LSTM + + +
+ The travel time data collected from widespread traffic monitoring sensors +necessitate big data analytic tools for querying, visualization, and +identifying meaningful traffic patterns. This paper utilizes a large-scale +travel time dataset from the Caltrans Performance Measurement System (PeMS) +that overwhelms traditional data processing and modeling tools. To +overcome the challenges of the massive amount of data, the big data analytic +engines Apache Spark and Apache MXNet are applied for data wrangling and +modeling. Seasonality and autocorrelation analyses were performed to explore and +visualize the trend of time-varying data. Inspired by the success of the +hierarchical architecture for many Artificial Intelligence (AI) tasks, we +consolidate the cell and hidden states passed from the low-level to the high-level +LSTM with an attention pooling similar to how the human perception system +operates. The designed hierarchical LSTM model can consider the dependencies at +different time scales to capture the spatial-temporal correlations of +network-level travel time. Another self-attention module is then devised to +connect LSTM extracted features to the fully connected layers, predicting +travel time for all corridors instead of a single link/route. The comparison +results show that the Hierarchical LSTM with Attention (HierLSTMat) model gives +the best prediction results at 30-minute and 45-minute horizons and can +successfully forecast unusual congestion. The efficiency gained from big data +analytic tools was evaluated by comparing them with popular data science and +deep learning frameworks. + +
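A compact PyTorch sketch of a two-level LSTM with attention pooling conveys the general architecture; the layer sizes, number of links, and horizon are illustrative assumptions, and the second self-attention module is collapsed into a single pooling step.

import torch
import torch.nn as nn

class HierLSTMat(nn.Module):
    """Illustrative two-level LSTM with attention pooling over time steps."""
    def __init__(self, n_links, hidden=64):
        super().__init__()
        self.low = nn.LSTM(n_links, hidden, batch_first=True)
        self.high = nn.LSTM(hidden, hidden, batch_first=True)
        self.att = nn.Linear(hidden, 1)         # attention pooling weights
        self.out = nn.Linear(hidden, n_links)   # predict travel time for all links

    def forward(self, x):                       # x: (batch, time, n_links)
        h_low, _ = self.low(x)
        h_high, _ = self.high(h_low)
        w = torch.softmax(self.att(h_high), dim=1)
        pooled = (w * h_high).sum(dim=1)        # attention-pooled temporal summary
        return self.out(pooled)

model = HierLSTMat(n_links=100)
pred = model(torch.randn(8, 12, 100))           # 12 past intervals -> next-interval prediction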
+
+
+
+
+ + ♻ ☆ Mate! Are You Really Aware? An Explainability-Guided Testing Framework + for Robustness of Malware Detectors + + +
+ Numerous open-source and commercial malware detectors are available. However, +their efficacy is threatened by new adversarial attacks, whereby malware +attempts to evade detection, e.g., by performing feature-space manipulation. In +this work, we propose an explainability-guided and model-agnostic testing +framework for robustness of malware detectors when confronted with adversarial +attacks. The framework introduces the concept of Accrued Malicious Magnitude +(AMM) to identify which malware features could be manipulated to maximize the +likelihood of evading detection. We then use this framework to test several +state-of-the-art malware detectors' abilities to detect manipulated malware. We +find that (i) commercial antivirus engines are vulnerable to AMM-guided test +cases; (ii) the ability of a manipulated malware generated using one detector +to evade detection by another detector (i.e., transferability) depends on the +overlap of features with large AMM values between the different detectors; and +(iii) AMM values effectively measure the fragility of features (i.e., +capability of feature-space manipulation to flip the prediction results) and +explain the robustness of malware detectors facing evasion attacks. Our +findings shed light on the limitations of current malware detectors, as well as +how they can be improved. + +
+
+ comment: Accepted at ESEC/FSE 2023. https://doi.org/10.1145/3611643.3616309 +
+
+
+
+
+ + ♻ ☆ RACH-Space: Reconstructing Adaptive Convex Hull Space with applications + in weak supervision + + +
+ We introduce RACH-Space, a novel classification method in ensemble learning. +In particular, we show its applicability as a label model for weakly supervised +learning. RACH-Space offers simplicity in implementation with minimal +assumptions on the data or weak signals. The model is well suited for scenarios +where fully labeled data is not available. Our method is built upon geometrical +interpretation of the space spanned by weak signals. Our analysis of the high +dimensional convex hull structure underlying general set of weak signals +bridges geometry with machine learning. Empirical results also demonstrate that +RACH-Space works well in practice and compares favorably to best existing label +models for weakly supervised learning. + +
+
+ comment: 11 pages +
+
+
+
+
+
+
+
+ + Multimedia 9 + +
+
+
+ + ☆ Real Time GAZED: Online Shot Selection and Editing of Virtual Cameras + from Wide-Angle Monocular Video Recordings + + +
+ Eliminating time-consuming post-production processes and delivering +high-quality videos in today's fast-paced digital landscape are the key +advantages of real-time approaches. To address these needs, we present Real +Time GAZED: a real-time adaptation of the GAZED framework integrated with +CineFilter, a novel real-time camera trajectory stabilization approach. It +enables users to create professionally edited videos in real-time. Comparative +evaluations against baseline methods, including the non-real-time GAZED, +demonstrate that Real Time GAZED achieves similar editing results, ensuring +high-quality video output. Furthermore, a user study confirms the aesthetic +quality of the video edits produced by the Real Time GAZED approach. With these +advancements in real-time camera trajectory optimization and video editing +presented, the demand for immediate and dynamic content creation in industries +such as live broadcasting, sports coverage, news reporting, and social media +content creation can be met more efficiently. + +
+
+
+
+
+ + ☆ EAFP-Med: An Efficient Adaptive Feature Processing Module Based on + Prompts for Medical Image Detection + + +
+ In the face of rapid advances in medical imaging, cross-domain adaptive +medical image detection is challenging due to the differences in lesion +representations across various medical imaging technologies. To address this +issue, we draw inspiration from large language models to propose EAFP-Med, an +efficient adaptive feature processing module based on prompts for medical image +detection. EAFP-Med can efficiently extract lesion features of different scales +from a diverse range of medical images based on prompts while being flexible +and not limited by specific imaging techniques. Furthermore, it serves as a +feature preprocessing module that can be connected to any model front-end to +enhance the lesion features in input images. Moreover, we propose a novel +adaptive disease detection model named EAFP-Med ST, which utilizes the Swin +Transformer V2 - Tiny (SwinV2-T) as its backbone and connects it to EAFP-Med. +We have compared our method to nine state-of-the-art methods. Experimental +results demonstrate that EAFP-Med ST achieves the best performance on all three +datasets (chest X-ray images, cranial magnetic resonance imaging images, and +skin images). EAFP-Med can efficiently extract lesion features from various +medical images based on prompts, enhancing the model's performance. This holds +significant potential for improving medical image analysis and diagnosis. + +
+
+
+
+
+ + ☆ Automatic Time Signature Determination for New Scores Using Lyrics for + Latent Rhythmic Structure + + +
+ There has recently been a sharp increase in interest in Artificial +Intelligence-Generated Content (AIGC). Despite this, musical components such as +time signatures have not been studied sufficiently to form an algorithmic +determination approach for new compositions, especially lyrical songs. This is +likely because of the neglect of musical details, which is critical for +constructing a robust framework. Specifically, time signatures establish the +fundamental rhythmic structure for almost all aspects of a song, including the +phrases and notes. In this paper, we propose a novel approach that only uses +lyrics as input to automatically generate a fitting time signature for lyrical +songs and uncover the latent rhythmic structure utilizing explainable machine +learning models. In particular, we devise multiple methods that are associated +with discovering lyrical patterns and creating new features that simultaneously +contain lyrical, rhythmic, and statistical information. In this approach, the +best of our experimental results reveal a 97.6% F1 score and a 0.996 Area Under +the Curve (AUC) of the Receiver Operating Characteristic (ROC) score. In +conclusion, our research directly generates time signatures from lyrics +automatically for new scores utilizing machine learning, which is an innovative +idea that approaches an understudied component of musicology and therefore +contributes significantly to the future of Artificial Intelligence (AI) music +generation. + +
+
+ comment: Submitted to IEEE Big Data 2023 Conference +
+
+
+
+
+ + ☆ Removing NSFW Concepts from Vision-and-Language Models for Text-to-Image + Retrieval and Generation + + +
+ Vision-and-Language models such as CLIP have demonstrated remarkable +effectiveness across a wide range of tasks. However, these models are typically +trained on web-scale data, which can introduce inappropriate content and lead +to the development of unsafe and biased behavior. This, in turn, hampers their +applicability in sensitive and trustworthy contexts and could raise significant +concern in their adoption. To overcome these limitations, we introduce a +methodology to make Vision-and-Language models safer by removing their +sensitivity to not-safe-for-work concepts. We show how this can be done by +distilling from a large language model which converts between safe and unsafe +sentences and which is fine-tuned starting from just 100 manually-curated +pairs. We conduct extensive experiments on the resulting embedding space for +both retrieval and text-to-image generation, where we show that our model can +also be properly employed with pre-trained image generators. Our source code +and trained models are available at: https://github.com/aimagelab/safe-clip. + +
+
+
+
+
+ + ☆ IG Captioner: Information Gain Captioners are Strong Zero-shot + Classifiers + + +
+ Generative training has been demonstrated to be powerful for building +visual-language models. However, on zero-shot discriminative benchmarks, there +is still a performance gap between models trained with generative and +discriminative objectives. In this paper, we aim to narrow this gap by +improving the efficacy of generative training on classification tasks, without +any finetuning processes or additional modules. + Specifically, we focus on narrowing the gap between the generative captioner +and the CLIP classifier. We begin by analysing the predictions made by the +captioner and classifier and observe that the caption generation inherits the +distribution bias from the language model trained with pure text modality, +making it less grounded on the visual signal. To tackle this problem, we +redesign the scoring objective for the captioner to alleviate the +distributional bias and focus on measuring the gain of information brought by +the visual inputs. We further design a generative training objective to match +the evaluation objective. We name our model trained and evaluated from the +novel procedures as Information Gain (IG) captioner. We pretrain the models on +the public Laion-5B dataset and perform a series of discriminative evaluations. +For the zero-shot classification on ImageNet, IG captioner achieves $> 18\%$ +improvements over the standard captioner, achieving comparable performances +with the CLIP classifier. IG captioner also demonstrated strong performance on +zero-shot image-text retrieval tasks on MSCOCO and Flickr30K. We hope this +paper inspires further research towards unifying generative and discriminative +training procedures for visual-language models. + +
+
+
+
+
+ + ♻ ☆ A Closer Look at Audio-Visual Segmentation + + +
+ Audio-visual segmentation (AVS) is a complex task that involves accurately +segmenting the corresponding sounding object based on audio-visual queries. +Successful audio-visual learning requires two essential components: 1) an +unbiased dataset with high-quality pixel-level multi-class labels, and 2) a +model capable of effectively linking audio information with its corresponding +visual object. However, these two requirements are only partially addressed by +current methods, with training sets containing biased audio-visual data, and +models that generalise poorly beyond this biased training set. In this work, we +propose a new strategy to build cost-effective and relatively unbiased +audio-visual semantic segmentation benchmarks. Our strategy, called Visual +Post-production (VPO), explores the observation that it is not necessary to +have explicit audio-visual pairs extracted from single video sources to build +such benchmarks. We also refine the previously proposed AVSBench to transform +it into the audio-visual semantic segmentation benchmark AVSBench-Single+. +Furthermore, this paper introduces a new pixel-wise audio-visual contrastive +learning method to enable a better generalisation of the model beyond the +training set. We verify the validity of the VPO strategy by showing that +state-of-the-art (SOTA) models trained with datasets built by matching audio +and visual data from different sources or with datasets containing audio and +visual data from the same video source produce almost the same accuracy. Then, +using the proposed VPO benchmarks and AVSBench-Single+, we show that our method +produces more accurate audio-visual semantic segmentation than SOTA models. +Code and dataset will be available. + +
+
+
+
+
+ + ♻ ☆ HierSpeech++: Bridging the Gap between Semantic and Acoustic + Representation of Speech by Hierarchical Variational Inference for Zero-shot + Speech Synthesis + + +
+ Large language model (LLM)-based speech synthesis has been widely adopted in +zero-shot speech synthesis. However, such models require large-scale data and +possess the same limitations as previous autoregressive speech models, +including slow inference speed and lack of robustness. This paper proposes +HierSpeech++, a fast and strong zero-shot speech synthesizer for text-to-speech +(TTS) and voice conversion (VC). We verified that hierarchical speech synthesis +frameworks could significantly improve the robustness and expressiveness of the +synthetic speech. Furthermore, we significantly improve the naturalness and +speaker similarity of synthetic speech even in zero-shot speech synthesis +scenarios. For text-to-speech, we adopt the text-to-vec framework, which +generates a self-supervised speech representation and an F0 representation +based on text representations and prosody prompts. Then, HierSpeech++ generates +speech from the generated vector, F0, and voice prompt. We further introduce a +highly efficient speech super-resolution framework from 16 kHz to 48 kHz. The +experimental results demonstrated that the hierarchical variational autoencoder +could be a strong zero-shot speech synthesizer given that it outperforms +LLM-based and diffusion-based models. Moreover, we achieved the first +human-level quality zero-shot speech synthesis. Audio samples and source code +are available at https://github.com/sh-lee-prml/HierSpeechpp. + +
+
+ comment: 16 pages, 9 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ SLMIA-SR: Speaker-Level Membership Inference Attacks against Speaker + Recognition Systems NDSS + + +
+ Membership inference attacks allow adversaries to determine whether a +particular example was contained in the model's training dataset. While +previous works have confirmed the feasibility of such attacks in various +applications, none has focused on speaker recognition (SR), a promising +voice-based biometric recognition technique. In this work, we propose SLMIA-SR, +the first membership inference attack tailored to SR. In contrast to +conventional example-level attack, our attack features speaker-level membership +inference, i.e., determining if any voices of a given speaker, either the same +as or different from the given inference voices, have been involved in the +training of a model. It is particularly useful and practical since the training +and inference voices are usually distinct, and it is also meaningful +considering the open-set nature of SR, namely, the recognition speakers were +often not present in the training data. We utilize intra-similarity and +inter-dissimilarity, two training objectives of SR, to characterize the +differences between training and non-training speakers and quantify them with +two groups of features driven by carefully-established feature engineering to +mount the attack. To improve the generalizability of our attack, we propose a +novel mixing ratio training strategy to train attack models. To enhance the +attack performance, we introduce voice chunk splitting to cope with the limited +number of inference voices and propose to train attack models dependent on the +number of inference voices. Our attack is versatile and can work in both +white-box and black-box scenarios. Additionally, we propose two novel +techniques to reduce the number of black-box queries while maintaining the +attack performance. Extensive experiments demonstrate the effectiveness of +SLMIA-SR. + +
+
+ comment: In Proceedings of the 31st Network and Distributed System Security + (NDSS) Symposium, 2024 +
+
+
+
+
+ + ♻ ☆ Archiving Body Movements: Collective Generation of Chinese Calligraphy + + +
+ As a communication channel, body movements have been widely explored in +behavioral studies and kinesics. Performing and visual arts share the same +interests but focus on documenting and representing human body movements, such +as for dance notation and visual work creation. This paper investigates body +movements in oriental calligraphy and how to apply calligraphy principles to +stimulate and archive body movements. Through an artwork (Wushu), the authors +experiment with an interactive and generative approach to engage the audience's +bodily participation and archive the body movements as a compendium of +generated calligraphy. The audience assumes the role of both writers and +readers; creating ("writing") and appreciating ("reading") the generated +calligraphy becomes a cyclical process within this infinite "Book," which can +motivate further attention and discussions concerning Chinese characters and +calligraphy. + +
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 22 + +
+
+
+ + ☆ Uncertainty-aware Language Modeling for Selective Question Answering + + +
+ We present an automatic large language model (LLM) conversion approach that +produces uncertainty-aware LLMs capable of estimating uncertainty with every +prediction. Our approach is model- and data-agnostic, is +computationally-efficient, and does not rely on external models or systems. We +evaluate converted models on the selective question answering setting -- to +answer as many questions as possible while maintaining a given accuracy, +forgoing providing predictions when necessary. As part of our results, we test +BERT and Llama 2 model variants on the SQuAD extractive QA task and the +TruthfulQA generative QA task. We show that using the uncertainty estimates +provided by our approach to selectively answer questions leads to significantly +higher accuracy over directly using model probabilities. + +
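The selective question answering setting itself is easy to simulate: given a per-question confidence score and a correctness indicator, accuracy is measured only on the most confident fraction of questions. The toy numbers below are synthetic, not the paper's results.

import numpy as np

def selective_accuracy(confidences, correct, coverage=0.8):
    """Answer only the `coverage` fraction of questions the model is most confident on."""
    confidences, correct = np.asarray(confidences), np.asarray(correct)
    k = int(len(confidences) * coverage)
    keep = np.argsort(-confidences)[:k]          # most confident questions first
    return correct[keep].mean()

# Synthetic example: confidences loosely correlated with correctness.
rng = np.random.default_rng(0)
correct = rng.random(1000) < 0.7
conf = np.where(correct, rng.uniform(0.5, 1.0, 1000), rng.uniform(0.0, 0.8, 1000))
print(selective_accuracy(conf, correct, coverage=0.5))   # accuracy rises as coverage shrinks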
+
+
+
+
+ + ☆ Learning to Skip for Language Modeling + + +
+ Overparameterized large-scale language models have impressive generalization +performance of in-context few-shot learning. However, most language models +allocate the same amount of parameters or computation to each token, +disregarding the complexity or importance of the input data. We argue that in +language model pretraining, a variable amount of computation should be assigned +to different tokens, and this can be efficiently achieved via a simple routing +mechanism. Different from conventional early stopping techniques where tokens +can early exit at only early layers, we propose a more general method that +dynamically skips the execution of a layer (or module) for any input token with +a binary router. In our extensive evaluation across 24 NLP tasks, we +demonstrate that the proposed method can significantly improve the 1-shot +performance compared to other competitive baselines only at mild extra cost for +inference. + +
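A simplified PyTorch sketch of per-token layer skipping with a binary router is given below; it shows the routing logic only (a real implementation would gather the routed tokens to actually save compute), and the straight-through trick stands in for whatever training scheme the paper uses.

import torch
import torch.nn as nn

class SkipRoutedLayer(nn.Module):
    """A binary router decides, per token, whether to execute the wrapped layer."""
    def __init__(self, layer, d_model):
        super().__init__()
        self.layer = layer
        self.router = nn.Linear(d_model, 1)

    def forward(self, x):                           # x: (batch, seq, d_model)
        p = torch.sigmoid(self.router(x))
        gate = (p > 0.5).float()
        if self.training:                            # straight-through estimator for gradients
            gate = gate + p - p.detach()
        # Skipped tokens pass through unchanged; routed tokens go through the layer.
        return gate * self.layer(x) + (1.0 - gate) * x

block = nn.Sequential(nn.Linear(512, 2048), nn.GELU(), nn.Linear(2048, 512))
layer = SkipRoutedLayer(block, d_model=512)
out = layer(torch.randn(2, 16, 512))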
+
+
+
+
+ + ☆ Machine-Generated Text Detection using Deep Learning + + +
+ Our research focuses on the crucial challenge of discerning text produced by +Large Language Models (LLMs) from human-generated text, which holds +significance for various applications. With ongoing discussions about attaining +a model with such functionality, we present supporting evidence regarding the +feasibility of such models. We evaluated our models on multiple datasets, +including Twitter Sentiment, Football Commentary, Project Gutenberg, PubMedQA, +and SQuAD, confirming the efficacy of the enhanced detection approaches. These +datasets were sampled with intricate constraints encompassing every +possibility, laying the foundation for future research. We evaluate +GPT-3.5-Turbo against various detectors such as SVM, RoBERTa-base, and +RoBERTa-large. Our findings indicate that detection performance depends +predominantly on the sequence length of the sentence. + +
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ Learning Section Weights for Multi-Label Document Classification + + +
+ Multi-label document classification is a traditional task in NLP. Compared to +single-label classification, each document can be assigned multiple classes. +This problem is crucially important in various domains, such as tagging +scientific articles. Documents are often structured into several sections such +as abstract and title. Current approaches treat different sections equally for +multi-label classification. We argue that this is not a realistic assumption, +leading to sub-optimal results. Instead, we propose a new method called +Learning Section Weights (LSW), leveraging the contribution of each distinct +section for multi-label classification. Via multiple feed-forward layers, LSW +learns to assign a weight to each section of a document and incorporates these +weights in the prediction. We demonstrate our approach on scientific articles. Experimental +results on public (arXiv) and private (Elsevier) datasets confirm the +superiority of LSW, compared to state-of-the-art multi-label document +classification methods. In particular, LSW achieves a 1.3% improvement in +macro-averaged F1-score and a 1.3% improvement in macro-averaged +recall on the publicly available arXiv dataset. + +
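A hypothetical sketch of the section-weighting idea: feed-forward layers score each section embedding, the softmax-normalised scores weight the sections into a document vector, and a multi-label head is trained with a sigmoid cross-entropy loss. The dimensions and the exact weighting network are assumptions, not the paper's configuration.

import torch
import torch.nn as nn

class SectionWeightedClassifier(nn.Module):
    """Learn a weight per document section and combine section embeddings."""
    def __init__(self, emb_dim, n_labels):
        super().__init__()
        self.weight_net = nn.Sequential(nn.Linear(emb_dim, 64), nn.ReLU(), nn.Linear(64, 1))
        self.classifier = nn.Linear(emb_dim, n_labels)

    def forward(self, section_embs):                # (batch, n_sections, emb_dim)
        w = torch.softmax(self.weight_net(section_embs), dim=1)   # weight per section
        doc = (w * section_embs).sum(dim=1)                       # weighted document embedding
        return self.classifier(doc)                               # multi-label logits

model = SectionWeightedClassifier(emb_dim=768, n_labels=20)
logits = model(torch.randn(4, 3, 768))              # e.g. title / abstract / body embeddings
loss = nn.BCEWithLogitsLoss()(logits, torch.randint(0, 2, (4, 20)).float())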
+
+ comment: 7 pages, 4 figures, 5 tables +
+
+
+
+
+ + ☆ Enhancing Empathetic and Emotion Support Dialogue Generation with + Prophetic Commonsense Inference + + +
+ The interest in Empathetic and Emotional Support conversations among the +public has significantly increased. To offer more sensitive and understanding +responses, leveraging commonsense knowledge has become a common strategy to +better understand psychological aspects and causality. However, such +commonsense inferences can be out of context and unable to predict upcoming +dialogue themes, resulting in responses that lack coherence and empathy. To +remedy this issue, we present Prophetic Commonsense Inference, an innovative +paradigm for inferring commonsense knowledge. By harnessing the capabilities of +Large Language Models in understanding dialogue and making commonsense +deductions, we train tunable models to bridge the gap between past and +potential future dialogues. Extensive experiments conducted on +EmpatheticDialogues and Emotion Support Conversation show that equipping +dialogue agents with our proposed prophetic commonsense inference significantly +enhances the quality of their responses. + +
+
+
+
+
+ + ☆ UHGEval: Benchmarking the Hallucination of Chinese Large Language Models + via Unconstrained Generation ICDE2024 + + +
+ Large language models (LLMs) have emerged as pivotal contributors in +contemporary natural language processing and are increasingly being applied +across a diverse range of industries. However, these large-scale probabilistic +statistical models cannot currently ensure the requisite quality in +professional content generation. These models often produce hallucinated text, +compromising their practical utility in professional contexts. To assess the +authentic reliability of LLMs in text generation, numerous initiatives have +developed benchmark evaluations for hallucination phenomena. Nevertheless, +these benchmarks frequently utilize constrained generation techniques due to +cost and temporal constraints. These techniques encompass the use of directed +hallucination induction and strategies that deliberately alter authentic text +to produce hallucinations. These approaches are not congruent with the +unrestricted text generation demanded by real-world applications. Furthermore, +a well-established Chinese-language dataset dedicated to the evaluation of +hallucinations in text generation is presently lacking. Consequently, we have +developed an Unconstrained Hallucination Generation Evaluation (UHGEval) +benchmark, designed to compile outputs produced with minimal restrictions by +LLMs. Concurrently, we have established a comprehensive benchmark evaluation +framework to aid subsequent researchers in undertaking scalable and +reproducible experiments. We have also executed extensive experiments, +evaluating prominent Chinese language models and the GPT series models to +derive professional performance insights regarding hallucination challenges. + +
+
+ comment: 13 Pages, submitted to ICDE2024 +
+
+
+
+
+ + ☆ Dataset for Stock Market Forecasting Based on Quantitative Analysis and + Qualitative Data + + +
+ The application of Machine learning to finance has become a familiar +approach, even more so in stock market forecasting. The stock market is highly +volatile and huge amounts of data are generated every minute globally. The +extraction of effective intelligence from this data is of critical importance. +However, combining numerical stock data with qualitative text data can +be a challenging task. In this work, we accomplish this and provide an +unprecedented, publicly available dataset with technical and fundamental data, +along with sentiment gathered from news archives, TV news captions, radio +transcripts, tweets, daily financial newspapers, etc. The text data entries +used for sentiment extraction total more than 1.4 million. The dataset +comprises daily entries from January 2018 to December 2022 for 8 different +companies and the Dow Jones Index as a whole. Holistic fundamental and technical +data are provided, training-ready for model learning and deployment. The +predictive power of deep learning models is highly determined by the training +data provided. This dataset will benefit research globally that +incorporates qualitative intelligence for stock market forecasting. The +dataset is made available at https://github.com/batking24/Huge-Stock-Dataset. +
+
+
+
+
+ + ☆ Probabilistic Transformer: A Probabilistic Dependency Model for + Contextual Word Representation ACL2023 + + +
+ Syntactic structures used to play a vital role in natural language processing +(NLP), but since the deep learning revolution, NLP has been gradually dominated +by neural models that do not consider syntactic structures in their design. One +vastly successful class of neural models is transformers. When used as an +encoder, a transformer produces contextual representation of words in the input +sentence. In this work, we propose a new model of contextual word +representation, not from a neural perspective, but from a purely syntactic and +probabilistic perspective. Specifically, we design a conditional random field +that models discrete latent representations of all words in a sentence as well +as dependency arcs between them; and we use mean field variational inference +for approximate inference. Strikingly, we find that the computation graph of +our model resembles transformers, with correspondences between dependencies and +self-attention and between distributions over latent representations and +contextual embeddings of words. Experiments show that our model performs +competitively to transformers on small to medium sized datasets. We hope that +our work could help bridge the gap between traditional syntactic and +probabilistic approaches and cutting-edge neural approaches to NLP, and inspire +more linguistically-principled neural approaches in the future. + +
+
+ comment: Accepted to ACL2023 Findings +
+
+
+
+
+ + ☆ LongStory: Coherent, Complete and Length Controlled Long story + Generation + + +
+ A human author can write any length of story without losing coherence. Also, +they always bring the story to a proper ending, an ability that current +language models lack. In this work, we present the LongStory for coherent, +complete, and length-controlled long story generation. LongStory introduces two +novel methodologies: (1) the long and short-term contexts weight calibrator +(CWC) and (2) long story structural positions (LSP). The CWC adjusts weights +for long-term context Memory and short-term context Cheating, acknowledging +their distinct roles. The LSP employs discourse tokens to convey the structural +positions of a long story. Trained on three datasets with varied average story +lengths, LongStory outperforms other baselines, including the strong story +generator Plotmachine, in coherence, completeness, relevance, and +repetitiveness. We also perform zero-shot tests on each dataset to assess the +model's ability to predict outcomes beyond its training data and validate our +methodology by comparing its performance with variants of our model. + +
+
+
+
+
+ + ☆ ChatGPT and Beyond: The Generative AI Revolution in Education + + +
+ The wide adoption and usage of generative artificial intelligence (AI) +models, particularly ChatGPT, has sparked a surge in research exploring their +potential applications in the educational landscape. This survey examines +academic literature published between November, 2022, and July, 2023, +specifically targeting high-impact research from Scopus-indexed Q1 and Q2 +journals. This survey delves into the practical applications and implications +of generative AI models across a diverse range of educational contexts. Through +a comprehensive and rigorous evaluation of recent academic literature, this +survey seeks to illuminate the evolving role of generative AI models, +particularly ChatGPT, in education. By shedding light on the potential +benefits, challenges, and emerging trends in this dynamic field, the survey +endeavors to contribute to the understanding of the nexus between artificial +intelligence and education. The findings of this review will empower educators, +researchers, and policymakers to make informed decisions about the integration +of AI technologies into learning environments. + +
+
+
+
+
+ + ☆ Benchmarking Large Language Model Volatility + + +
+ The impact of non-deterministic outputs from Large Language Models (LLMs) is +not well examined for financial text understanding tasks. Through a compelling +case study on investing in the US equity market via news sentiment analysis, we +uncover substantial variability in sentence-level sentiment classification +results, underscoring the innate volatility of LLM outputs. These uncertainties +cascade downstream, leading to more significant variations in portfolio +construction and return. While tweaking the temperature parameter in the +language model decoder presents a potential remedy, it comes at the expense of +stifled creativity. Similarly, while ensembling multiple outputs mitigates the +effect of volatile outputs, it demands a notable computational investment. This +work furnishes practitioners with invaluable insights for adeptly navigating +uncertainty in the integration of LLMs into financial decision-making, +particularly in scenarios dictated by non-deterministic information. + +
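The ensembling remedy mentioned above amounts to sampling the same classification several times and aggregating, for example by majority vote, at the cost of extra model calls. A small sketch follows, with a simulated non-deterministic classifier standing in for the LLM call; the label set and aggregation rule are assumptions.

import random
from collections import Counter

def classify_sentiment(headline):
    """Stand-in for a sampled LLM call; real usage would query the model at temperature > 0."""
    return random.choice(["positive", "positive", "neutral", "negative"])  # simulated volatility

def ensembled_sentiment(headline, n_samples=5):
    votes = Counter(classify_sentiment(headline) for _ in range(n_samples))
    label, count = votes.most_common(1)[0]
    return label, count / n_samples        # agreement rate as a crude stability measure

print(ensembled_sentiment("Company X beats quarterly earnings expectations"))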
+
+ comment: 7 pages, 2 figures, Workshop on AI Safety and Robustness In Finance, + ICAIF 2023 +
+
+
+
+
+ + ♻ ☆ Conditional Adapters: Parameter-efficient Transfer Learning with Fast + Inference NeurIPS + + +
+ We propose Conditional Adapter (CoDA), a parameter-efficient transfer +learning method that also improves inference efficiency. CoDA generalizes +beyond standard adapter approaches to enable a new way of balancing speed and +accuracy using conditional computation. Starting with an existing dense +pretrained model, CoDA adds sparse activation together with a small number of +new parameters and a light-weight training phase. Our experiments demonstrate +that the CoDA approach provides an unexpectedly efficient way to transfer +knowledge. Across a variety of language, vision, and speech tasks, CoDA +achieves a 2x to 8x inference speed-up compared to the state-of-the-art Adapter +approaches with moderate to no accuracy loss and the same parameter efficiency. + +
+
+ comment: NeurIPS camera ready version +
+
+
+
+
+ + ♻ ☆ Sequential Monte Carlo Steering of Large Language Models using + Probabilistic Programs + + +
+ Even after fine-tuning and reinforcement learning, large language models +(LLMs) can be difficult, if not impossible, to control reliably with prompts +alone. We propose a new inference-time approach to enforcing syntactic and +semantic constraints on the outputs of LLMs, called sequential Monte Carlo +(SMC) steering. The key idea is to specify language generation tasks as +posterior inference problems in a class of discrete probabilistic sequence +models, and replace standard decoding with sequential Monte Carlo inference. +For a computational cost similar to that of beam search, SMC can steer LLMs to +solve diverse tasks, including infilling, generation under syntactic +constraints, and prompt intersection. To facilitate experimentation with SMC +steering, we present a probabilistic programming library, LLaMPPL +(https://github.com/probcomp/hfppl), for concisely specifying new generation +tasks as language model probabilistic programs, and automating steering of +LLaMA-family Transformers. + +
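Conceptually, SMC steering maintains a population of partial generations that are extended by the LM, reweighted by a constraint potential, and resampled. The sketch below is a generic SMC loop with placeholder `propose` and `log_potential` callables; it is not the LLaMPPL API and omits many refinements.

import math
import random

def smc_steer(propose, log_potential, n_particles=8, max_steps=50, eos="<eos>"):
    """Generic sketch: propose(tokens) samples the next token from the LM;
    log_potential(tokens) scores a partial sequence under the constraint."""
    particles = [([], 0.0) for _ in range(n_particles)]      # (tokens, cumulative log weight)
    for _ in range(max_steps):
        extended = []
        for tokens, logw in particles:
            if tokens and tokens[-1] == eos:
                extended.append((tokens, logw))
                continue
            new_tokens = tokens + [propose(tokens)]
            # Incremental weight: change in the constraint potential.
            logw += log_potential(new_tokens) - log_potential(tokens)
            extended.append((new_tokens, logw))
        # Multinomial resampling proportional to the weights.
        m = max(lw for _, lw in extended)
        probs = [math.exp(lw - m) for _, lw in extended]
        particles = [(t, 0.0) for t, _ in random.choices(extended, weights=probs, k=n_particles)]
        if all(t and t[-1] == eos for t, _ in particles):
            break
    return [t for t, _ in particles]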
+
+ comment: Minor typo fixes +
+
+
+
+
+ + ♻ ☆ In-Context Impersonation Reveals Large Language Models' Strengths and + Biases NeurIPS 2023 + + +
+ In everyday conversations, humans can take on different roles and adapt their +vocabulary to their chosen roles. We explore whether LLMs can take on, that is +impersonate, different roles when they generate text in-context. We ask LLMs to +assume different personas before solving vision and language tasks. We do this +by prefixing the prompt with a persona that is associated either with a social +identity or domain expertise. In a multi-armed bandit task, we find that LLMs +pretending to be children of different ages recover human-like developmental +stages of exploration. In a language-based reasoning task, we find that LLMs +impersonating domain experts perform better than LLMs impersonating non-domain +experts. Finally, we test whether LLMs' impersonations are complementary to +visual information when describing different categories. We find that +impersonation can improve performance: an LLM prompted to be a bird expert +describes birds better than one prompted to be a car expert. However, +impersonation can also uncover LLMs' biases: an LLM prompted to be a man +describes cars better than one prompted to be a woman. These findings +demonstrate that LLMs are capable of taking on diverse roles and that this +in-context impersonation can be used to uncover their hidden strengths and +biases. + +
+
+ comment: Published in NeurIPS 2023 (Spotlight) +
+
+
+
+
+ + ♻ ☆ InferEM: Inferring the Speaker's Intention for Empathetic Dialogue + Generation + + +
+ Current approaches to empathetic response generation typically encode the +entire dialogue history directly and put the output into a decoder to generate +friendly feedback. These methods focus on modelling contextual information but +neglect capturing the direct intention of the speaker. We argue that the last +utterance in the dialogue empirically conveys the intention of the speaker. +Consequently, we propose a novel model named InferEM for empathetic response +generation. We separately encode the last utterance and fuse it with the entire +dialogue through a multi-head attention-based intention fusion module to +capture the speaker's intention. Besides, we utilize previous utterances to +predict the last utterance, which simulates the human tendency to anticipate what +the interlocutor may say next. To balance the optimization rates of +utterance prediction and response generation, a multi-task learning strategy is +designed for InferEM. Experimental results demonstrate the plausibility and +validity of InferEM in improving empathetic expression. + +
+
+ comment: Accepted by the 45th Annual Meeting of the Cognitive Science Society + (CogSci 2023) +
+
+
+
+
+ + ♻ ☆ Mirror: A Universal Framework for Various Information Extraction Tasks EMNLP23 + + +
+ Sharing knowledge between information extraction tasks has always been a +challenge due to the diverse data formats and task variations. Meanwhile, this +divergence leads to information waste and increases difficulties in building +complex applications in real scenarios. Recent studies often formulate IE tasks +as a triplet extraction problem. However, such a paradigm does not support +multi-span and n-ary extraction, leading to weak versatility. To this end, we +reorganize IE problems into unified multi-slot tuples and propose a universal +framework for various IE tasks, namely Mirror. Specifically, we recast existing +IE tasks as a multi-span cyclic graph extraction problem and devise a +non-autoregressive graph decoding algorithm to extract all spans in a single +step. It is worth noting that this graph structure is incredibly versatile, and +it supports not only complex IE tasks, but also machine reading comprehension +and classification tasks. We manually construct a corpus containing 57 datasets +for model pretraining, and conduct experiments on 30 datasets across 8 +downstream tasks. The experimental results demonstrate that our model has +decent compatibility and outperforms or reaches competitive performance with +SOTA systems under few-shot and zero-shot settings. The code, model weights, +and pretraining corpus are available at https://github.com/Spico197/Mirror . + +
+
+ comment: Accepted to EMNLP23 main conference +
+
+
+
+
+ + ♻ ☆ AI-Augmented Surveys: Leveraging Large Language Models and Surveys for + Opinion Prediction + + +
+ Large language models (LLMs) that produce human-like responses have begun to +revolutionize research practices in the social sciences. This paper shows how +we can integrate LLMs and social surveys to accurately predict individual +responses to survey questions that were not asked before. We develop a novel +methodological framework to personalize LLMs by considering the meaning of +survey questions derived from their text, the latent beliefs of individuals +inferred from their response patterns, and the temporal contexts across +different survey periods through fine-tuning LLMs with survey data. Using the +General Social Survey from 1972 to 2021, we show that the fine-tuned model +based on Alpaca-7b can predict individual responses to survey questions that +are partially missing as well as entirely missing. The remarkable prediction +capabilities allow us to fill in missing trends with high confidence and +pinpoint when public attitudes changed, such as the rising support for same-sex +marriage. We discuss practical constraints, socio-demographic representation, +and ethical concerns regarding individual autonomy and privacy when using LLMs +for opinion prediction. This study demonstrates that LLMs and surveys can +mutually enhance each other's capabilities: LLMs broaden survey potential, +while surveys improve the alignment of LLMs. + +
+
+
+
+
+ + ♻ ☆ Unveiling Public Perceptions: Machine Learning-Based Sentiment Analysis + of COVID-19 Vaccines in India + + +
+ In March 2020, the World Health Organisation declared COVID-19 a global +pandemic as it spread to nearly every country. By mid-2021, India had +introduced three vaccines: Covishield, Covaxin, and Sputnik. To ensure +successful vaccination in a densely populated country like India, understanding +public sentiment was crucial. Social media, particularly Reddit with over 430 +million users, played a vital role in disseminating information. This study +employs data mining techniques to analyze Reddit data and gauge Indian +sentiments towards COVID-19 vaccines. Using Python's TextBlob library, +comments are annotated to assess general sentiments. Results show that most +Reddit users in India expressed neutrality about vaccination, posing a +challenge for the Indian government's efforts to vaccinate a significant +portion of the population. + +
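For reference, TextBlob exposes a polarity score in [-1, 1] per comment; a minimal annotation helper might look as follows. The neutral band of plus or minus 0.05 is an assumption, not necessarily the study's cut-off.

from textblob import TextBlob

def label_comment(text):
    """Annotate a Reddit comment as positive / neutral / negative by polarity."""
    polarity = TextBlob(text).sentiment.polarity    # in [-1, 1]
    if polarity > 0.05:
        return "positive"
    if polarity < -0.05:
        return "negative"
    return "neutral"

print(label_comment("Got my second Covishield dose today, no side effects so far."))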
+
+
+
+
+ + ♻ ☆ Multimodal Document Analytics for Banking Process Automation + + +
+ Traditional banks face increasing competition from FinTechs in the rapidly +evolving financial ecosystem. Raising operational efficiency is vital to +address this challenge. Our study aims to improve the efficiency of +document-intensive business processes in banking. To that end, we first review +the landscape of business documents in the retail segment. Banking documents +often contain text, layout, and visuals, suggesting that document analytics and +process automation require more than plain natural language processing (NLP). +To verify this and assess the incremental value of visual cues when processing +business documents, we compare a recently proposed multimodal model called +LayoutXLM to powerful text classifiers (e.g., BERT) and large language models +(e.g., GPT) in a case study related to processing company register extracts. +The results confirm that incorporating layout information in a model +substantially increases its performance. Interestingly, we also observed that +more than 75% of the best model performance (in terms of the F1 score) can be +achieved with as little as 30% of the training data. This shows that the demand +for labeled data to set up a multimodal model can be moderate, which +simplifies real-world applications of multimodal document analytics. Our study +also sheds light on more specific practices in the scope of calibrating a +multimodal banking document classifier, including the need for fine-tuning. In +sum, the paper contributes original empirical evidence on the effectiveness and +efficiency of multimodal models for document processing in the banking +business and offers practical guidance on how to unlock this potential in +day-to-day operations. + +
+
+ comment: A Preprint +
+
+
+
+
+ + ♻ ☆ DocAsRef: An Empirical Study on Repurposing Reference-Based Summary + Quality Metrics Reference-Freely EMNLP 2023 + + +
+ Automated summary quality assessment falls into two categories: +reference-based and reference-free. Reference-based metrics, historically +deemed more accurate due to the additional information provided by +human-written references, are limited by their reliance on human input. In this +paper, we hypothesize that the comparison methodologies used by some +reference-based metrics to evaluate a system summary against its corresponding +reference can be effectively adapted to assess it against its source document, +thereby transforming these metrics into reference-free ones. Experimental +results support this hypothesis. After being repurposed reference-freely, the +zero-shot BERTScore using the pretrained DeBERTa-large-MNLI model of <0.5B +parameters consistently outperforms its original reference-based version across +various aspects on the SummEval and Newsroom datasets. It also excels in +comparison to most existing reference-free metrics and closely competes with +zero-shot summary evaluators based on GPT-3.5. + +
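Operationally, the repurposing is a one-line change: pass the source document where the reference summary would normally go. A sketch with the bert-score package and the DeBERTa-large-MNLI backbone named in the abstract follows; the example texts are made up.

from bert_score import score

summaries = ["The city council approved the new budget after a long debate."]
sources = ["After weeks of debate, the city council voted on Tuesday to approve the new budget for the coming fiscal year."]

# Repurposed reference-freely: the system summary is scored against its SOURCE
# DOCUMENT instead of a human-written reference summary.
P, R, F1 = score(summaries, sources, model_type="microsoft/deberta-large-mnli")
print(F1.tolist())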
+
+ comment: Accepted into Findings of EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ Autoregressive Language Models For Estimating the Entropy of Epic EHR + Audit Logs ML4H + + +
+ EHR audit logs are a highly granular stream of events that capture clinician +activities, and are a significant area of interest for research in +characterizing clinician workflow on the electronic health record (EHR). +Existing techniques to measure the complexity of workflow through EHR audit +logs (audit logs) involve time- or frequency-based cross-sectional aggregations +that are unable to capture the full complexity of an EHR session. We briefly +evaluate the usage of a transformer-based tabular language model (tabular LM) in +measuring the entropy or disorderedness of action sequences within workflow and +release the evaluated models publicly. + +
+
+ comment: Extended Abstract presented at Machine Learning for Health (ML4H) + symposium 2023, December 10th, 2023, New Orleans, United States, 10 pages +
+
+
+
+
+ + ♻ ☆ PlanBench: An Extensible Benchmark for Evaluating Large Language Models + on Planning and Reasoning about Change NeurIPS 2023 + + +
+ Generating plans of action, and reasoning about change have long been +considered a core competence of intelligent agents. It is thus no surprise that +evaluating the planning and reasoning capabilities of large language models +(LLMs) has become a hot topic of research. Most claims about LLM planning +capabilities are however based on common sense tasks-where it becomes hard to +tell whether LLMs are planning or merely retrieving from their vast world +knowledge. There is a strong need for systematic and extensible planning +benchmarks with sufficient diversity to evaluate whether LLMs have innate +planning capabilities. Motivated by this, we propose PlanBench, an extensible +benchmark suite based on the kinds of domains used in the automated planning +community, especially in the International Planning Competition, to test the +capabilities of LLMs in planning or reasoning about actions and change. +PlanBench provides sufficient diversity in both the task domains and the +specific planning capabilities. Our studies also show that on many critical +capabilities-including plan generation-LLM performance falls quite short, even +with the SOTA models. PlanBench can thus function as a useful marker of +progress of LLMs in planning and reasoning. + +
+
+ comment: NeurIPS 2023 Track on Datasets and Benchmarks +
+
+
+
+
+
+
+
+ + Information Retrieval 6 + +
+
+
+ + ☆ Data Augmentation for Sample Efficient and Robust Document Ranking + + +
+ Contextual ranking models have delivered impressive performance improvements +over classical models in the document ranking task. However, these highly +over-parameterized models tend to be data-hungry and require large amounts of +data even for fine-tuning. In this paper, we propose data-augmentation methods +for effective and robust ranking performance. One of the key benefits of using +data augmentation is in achieving sample efficiency or learning effectively +when we have only a small amount of training data. We propose supervised and +unsupervised data augmentation schemes by creating training data using parts of +the relevant documents in the query-document pairs. We then adapt a family of +contrastive losses for the document ranking task that can exploit the augmented +data to learn an effective ranking model. Our extensive experiments on subsets +of the MS MARCO and TREC-DL test sets show that data augmentation, along with +the ranking-adapted contrastive losses, results in performance improvements +under most dataset sizes. Apart from sample efficiency, we conclusively show +that data augmentation results in robust models when transferred to +out-of-domain benchmarks. Our performance improvements in in-domain and more +prominently in out-of-domain benchmarks show that augmentation regularizes the +ranking model and improves its robustness and generalization capability. + +
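The ranking-adapted contrastive losses are not spelled out in the abstract; the sketch below only illustrates the general pattern of an in-batch contrastive loss into which augmented positives (passages taken from the relevant document of each query) could be fed. The encoders, shapes, and temperature are assumptions.

```python
# Sketch: a generic in-batch contrastive (InfoNCE-style) loss for ranking.
# Row i of pos_doc_emb is an (augmented) passage from the document relevant
# to query i; all other rows in the batch act as negatives.
import torch
import torch.nn.functional as F

def in_batch_contrastive_loss(query_emb, pos_doc_emb, temperature=0.05):
    q = F.normalize(query_emb, dim=-1)
    d = F.normalize(pos_doc_emb, dim=-1)
    logits = q @ d.T / temperature
    labels = torch.arange(q.size(0), device=q.device)
    return F.cross_entropy(logits, labels)

loss = in_batch_contrastive_loss(torch.randn(8, 768), torch.randn(8, 768))
```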
+
+
+
+
+ + ☆ Query-LIFE: Query-aware Language Image Fusion Embedding for E-Commerce + Relevance + + +
+ The relevance module plays a fundamental role in e-commerce search, as it is responsible for selecting relevant products from thousands of items based on user queries, thereby enhancing user experience and efficiency. The traditional approach models relevance based on product titles and queries, but the information in titles alone may be insufficient to describe products completely. A more general optimization approach is to further leverage product image information. In recent years, vision-language pre-training models, which leverage contrastive learning to map both textual and visual features into a joint embedding space, have achieved impressive results in many scenarios. In e-commerce, a common practice is to fine-tune such pre-trained models on e-commerce data. However, the resulting performance is sub-optimal because vision-language pre-training models lack alignment specifically designed for queries. In this paper, we propose a method called Query-LIFE (Query-aware Language Image Fusion Embedding) to address these challenges. Query-LIFE utilizes query-based multimodal fusion to effectively incorporate the image and title based on the product type. Additionally, it employs query-aware modal alignment to enhance the accuracy of the comprehensive representation of products. Furthermore, we design GenFilt, which utilizes the generation capability of large models to filter out false negative samples and further improve the overall performance of the contrastive learning task. Experiments demonstrate that Query-LIFE outperforms existing baselines. We have conducted ablation studies and human evaluations to validate the effectiveness of each module within Query-LIFE. Moreover, Query-LIFE has been deployed on Miravia Search, improving both relevance and conversion efficiency.
+
+
+
+
+ + ♻ ☆ Typos-aware Bottlenecked Pre-Training for Robust Dense Retrieval SIGIR + + +
+ Current dense retrievers (DRs) are limited in their ability to effectively +process misspelled queries, which constitute a significant portion of query +traffic in commercial search engines. The main issue is that the pre-trained +language model-based encoders used by DRs are typically trained and fine-tuned +using clean, well-curated text data. Misspelled queries are typically not found +in the data used for training these models, and thus misspelled queries +observed at inference time are out-of-distribution compared to the data used +for training and fine-tuning. Previous efforts to address this issue have +focused on \textit{fine-tuning} strategies, but their effectiveness on +misspelled queries remains lower than that of pipelines that employ separate +state-of-the-art spell-checking components. To address this challenge, we +propose ToRoDer (TypOs-aware bottlenecked pre-training for RObust DEnse +Retrieval), a novel re-training strategy for DRs that increases their +robustness to misspelled queries while preserving their effectiveness in +downstream retrieval tasks. ToRoDer utilizes an encoder-decoder architecture +where the encoder takes misspelled text with masked tokens as input and outputs +bottlenecked information to the decoder. The decoder then takes as input the +bottlenecked embeddings, along with token embeddings of the original text with +the misspelled tokens masked out. The pre-training task is to recover the +masked tokens for both the encoder and decoder. Our extensive experimental +results and detailed ablation studies show that DRs pre-trained with ToRoDer +exhibit significantly higher effectiveness on misspelled queries, sensibly +closing the gap with pipelines that use a separate, complex spell-checker +component, while retaining their effectiveness on correctly spelled queries. + +
+
+ comment: 10 pages, accepted at SIGIR-AP +
+
+
+
+
+ + ♻ ☆ FLIP: Towards Fine-grained Alignment between ID-based Models and + Pretrained Language Models for CTR Prediction + + +
+ Click-through rate (CTR) prediction serves as a core function module in various personalized online services. The traditional ID-based models for CTR prediction take as inputs the one-hot encoded ID features of tabular modality, which capture the collaborative signals via feature interaction modeling. But the one-hot encoding discards the semantic information conveyed in the original feature texts. Recently, the emergence of Pretrained Language Models (PLMs) has given rise to another paradigm, which takes as inputs the sentences of textual modality obtained by hard prompt templates and adopts PLMs to extract the semantic knowledge. However, PLMs generally tokenize the input text data into subword tokens and ignore field-wise collaborative signals. Therefore, these two lines of research focus on different characteristics of the same input data (i.e., textual and tabular modalities), forming a distinct complementary relationship with each other. In this paper, we propose to conduct Fine-grained feature-level ALignment between ID-based Models and Pretrained Language Models (FLIP) for CTR prediction. We design a novel joint reconstruction pretraining task for both masked language and tabular modeling. Specifically, the masked data of one modality (i.e., tokens or features) has to be recovered with the help of the other modality, which establishes the feature-level interaction and alignment via sufficient mutual information extraction between dual modalities. Moreover, we propose to jointly finetune the ID-based model and PLM for downstream CTR prediction tasks, thus achieving superior performance by combining the advantages of both models. Extensive experiments on three real-world datasets demonstrate that FLIP outperforms SOTA baselines, and is highly compatible with various ID-based models and PLMs.
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ A Topology-aware Analysis of Graph Collaborative Filtering + + +
+ The successful integration of graph neural networks into recommender systems +(RSs) has led to a novel paradigm in collaborative filtering (CF), graph +collaborative filtering (graph CF). By representing user-item data as an +undirected, bipartite graph, graph CF utilizes short- and long-range +connections to extract collaborative signals that yield more accurate user +preferences than traditional CF methods. Although the recent literature +highlights the efficacy of various algorithmic strategies in graph CF, the +impact of datasets and their topological features on recommendation performance +is yet to be studied. To fill this gap, we propose a topology-aware analysis of +graph CF. In this study, we (i) take some widely-adopted recommendation +datasets and use them to generate a large set of synthetic sub-datasets through +two state-of-the-art graph sampling methods, (ii) measure eleven of their +classical and topological characteristics, and (iii) estimate the accuracy +calculated on the generated sub-datasets considering four popular and recent +graph-based RSs (i.e., LightGCN, DGCF, UltraGCN, and SVD-GCN). Finally, the +investigation presents an explanatory framework that reveals the linear +relationships between characteristics and accuracy measures. The results, +statistically validated under different graph sampling settings, confirm the +existence of solid dependencies between topological characteristics and +accuracy in the graph-based recommendation, offering a new perspective on how +to interpret graph CF. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Image Outlier Detection using RANSAC + + +
+ Image outlier detection (OD) is an essential tool to ensure the quality and +accuracy of image datasets used in computer vision tasks. Most existing +approaches, however, require a set of in-distribution data for training prior +to outlier prediction. The quality and quantity of the data can influence the +resulting performance. Thus, selecting a suitable in-distribution set often +requires considerable effort. In this work, we propose RANSAC-NN, an +unsupervised image OD algorithm designed to detect outliers within contaminated +sets in a one-class classification fashion. Without any training, RANSAC-NN +performs favorably in comparison to other well-established methods in a variety +of OD benchmarks. Furthermore, we show that our method can enhance the +robustness of existing OD methods by simply applying RANSAC-NN during +pre-processing. + +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ GAIA: Zero-shot Talking Avatar Generation + + +
+ Zero-shot talking avatar generation aims at synthesizing natural talking +videos from speech and a single portrait image. Previous methods have relied on +domain-specific heuristics such as warping-based motion representation and 3D +Morphable Models, which limit the naturalness and diversity of the generated +avatars. In this work, we introduce GAIA (Generative AI for Avatar), which +eliminates the domain priors in talking avatar generation. In light of the +observation that the speech only drives the motion of the avatar while the +appearance of the avatar and the background typically remain the same +throughout the entire video, we divide our approach into two stages: 1) +disentangling each frame into motion and appearance representations; 2) +generating motion sequences conditioned on the speech and reference portrait +image. We collect a large-scale high-quality talking avatar dataset and train +the model on it with different scales (up to 2B parameters). Experimental +results verify the superiority, scalability, and flexibility of GAIA as 1) the +resulting model beats previous baseline models in terms of naturalness, +diversity, lip-sync quality, and visual quality; 2) the framework is scalable +since larger models yield better results; 3) it is general and enables +different applications like controllable talking avatar generation and +text-instructed avatar generation. + +
+
+ comment: Project page: https://microsoft.github.io/GAIA/ +
+
+
+
+
+ + ♻ ☆ Adding Conditional Control to Text-to-Image Diffusion Models + + +
+ We present ControlNet, a neural network architecture to add spatial conditioning controls to large, pretrained text-to-image diffusion models. ControlNet locks the production-ready large diffusion models, and reuses their deep and robust encoding layers pretrained with billions of images as a strong backbone to learn a diverse set of conditional controls. The neural architecture is connected with "zero convolutions" (zero-initialized convolution layers) that progressively grow the parameters from zero and ensure that no harmful noise could affect the finetuning. We test various conditioning controls, e.g., edges, depth, segmentation, human pose, etc., with Stable Diffusion, using single or multiple conditions, with or without prompts. We show that the training of ControlNets is robust with small (<50k) and large (>1M) datasets. Extensive results show that ControlNet may facilitate wider applications to control image diffusion models.
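The "zero convolution" mentioned above is simply a convolution whose weights and bias start at zero, so the control branch contributes nothing at the start of finetuning. A minimal sketch follows; the channel count and tensor shape are illustrative, not the actual ControlNet configuration.

```python
# Sketch of a zero-initialized 1x1 convolution ("zero convolution").
import torch
import torch.nn as nn

def zero_conv(channels: int) -> nn.Conv2d:
    conv = nn.Conv2d(channels, channels, kernel_size=1)
    nn.init.zeros_(conv.weight)   # output is exactly zero at initialization,
    nn.init.zeros_(conv.bias)     # so no noise is injected into the frozen backbone
    return conv

feat = torch.randn(1, 320, 64, 64)           # output of a copied encoder block
assert torch.all(zero_conv(320)(feat) == 0)  # contributes nothing until trained
```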
+
+ comment: Codes and Supplementary Material: + https://github.com/lllyasviel/ControlNet +
+
+
+
+
+ + ♻ ☆ WavJourney: Compositional Audio Creation with Large Language Models + + +
+ Despite breakthroughs in audio generation models, their capabilities are +often confined to domain-specific conditions such as speech transcriptions and +audio captions. However, real-world audio creation aims to generate harmonious +audio containing various elements such as speech, music, and sound effects with +controllable conditions, which is challenging to address using existing audio +generation systems. We present WavJourney, a novel framework that leverages +Large Language Models (LLMs) to connect various audio models for audio +creation. WavJourney allows users to create storytelling audio content with +diverse audio elements simply from textual descriptions. Specifically, given a +text instruction, WavJourney first prompts LLMs to generate an audio script +that serves as a structured semantic representation of audio elements. The +audio script is then converted into a computer program, where each line of the +program calls a task-specific audio generation model or computational operation +function. The computer program is then executed to obtain a compositional and +interpretable solution for audio creation. Experimental results suggest that +WavJourney is capable of synthesizing realistic audio aligned with +textually-described semantic, spatial and temporal conditions, achieving +state-of-the-art results on text-to-audio generation benchmarks. Additionally, +we introduce a new multi-genre story benchmark. Subjective evaluations +demonstrate the potential of WavJourney in crafting engaging storytelling audio +content from text. We further demonstrate that WavJourney can facilitate +human-machine co-creation in multi-round dialogues. To foster future research, +the code and synthesized audio are available at: +https://audio-agi.github.io/WavJourney_demopage/. + +
+
+ comment: GitHub: https://github.com/Audio-AGI/WavJourney +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 23 + +
+
+
+ + ☆ Localizing Lying in Llama: Understanding Instructed Dishonesty on + True-False Questions Through Prompting, Probing, and Patching + + +
+ Large language models (LLMs) demonstrate significant knowledge through their +outputs, though it is often unclear whether false outputs are due to a lack of +knowledge or dishonesty. In this paper, we investigate instructed dishonesty, +wherein we explicitly prompt LLaMA-2-70b-chat to lie. We perform prompt +engineering to find which prompts best induce lying behavior, and then use +mechanistic interpretability approaches to localize where in the network this +behavior occurs. Using linear probing and activation patching, we localize five +layers that appear especially important for lying. We then find just 46 +attention heads within these layers that enable us to causally intervene such +that the lying model instead answers honestly. We show that these interventions +work robustly across many prompts and dataset splits. Overall, our work +contributes a greater understanding of dishonesty in LLMs so that we may hope +to prevent it. + +
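As a rough illustration of the linear-probing step described above, one can fit a logistic-regression probe on cached hidden activations to separate "instructed to lie" from honest prompts. The activations and labels below are random placeholders standing in for LLaMA-2-70b-chat activations at a chosen layer.

```python
# Sketch: a linear probe over hidden activations for the lying-vs-honest label.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
acts = rng.normal(size=(512, 8192))        # [examples, hidden_dim] at one layer (placeholder)
labels = rng.integers(0, 2, size=512)      # 1 = prompted to lie, 0 = prompted to be honest

X_tr, X_te, y_tr, y_te = train_test_split(acts, labels, test_size=0.2, random_state=0)
probe = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
print("probe accuracy:", probe.score(X_te, y_te))
```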
+
+ comment: 14 pages, 12 figures +
+
+
+
+
+ + ☆ Relevance feedback strategies for recall-oriented neural information + retrieval + + +
+ In a number of information retrieval applications (e.g., patent search, literature review, due diligence, etc.), preventing false negatives is more important than preventing false positives. However, approaches designed to reduce review effort (like "technology assisted review") can create false negatives, since they are often based on active learning systems that exclude documents automatically based on user feedback. Therefore, this research proposes a more recall-oriented approach to reducing review effort, specifically by iteratively re-ranking the relevance rankings based on user feedback, a process also referred to as relevance feedback. In our proposed method, the relevance rankings are produced by a BERT-based dense-vector search and the relevance feedback is based on cumulatively summing the queried and selected embeddings. Our results show that this method can reduce review effort by between 17.85% and 59.04%, compared to a baseline approach (of no feedback), given a fixed recall target.
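A minimal sketch of the feedback step described above: the feedback vector is the cumulative sum of the query embedding and the embeddings of documents the reviewer has marked relevant, and the remaining documents are re-ranked by similarity to it. The encoder and data are placeholders, and the exact update used in the paper may differ in detail.

```python
# Sketch: relevance feedback by cumulatively summing query and selected
# document embeddings, then re-ranking by cosine similarity.
import numpy as np

def rerank(query_emb, doc_embs, selected_ids):
    feedback = query_emb.copy()
    for i in selected_ids:                 # cumulative sum of reviewed-relevant docs
        feedback = feedback + doc_embs[i]
    sims = doc_embs @ feedback / (
        np.linalg.norm(doc_embs, axis=1) * np.linalg.norm(feedback) + 1e-12
    )
    return np.argsort(-sims)               # best-first ranking

rng = np.random.default_rng(0)
ranking = rerank(rng.normal(size=64), rng.normal(size=(100, 64)), selected_ids=[3, 17])
```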
+
+
+
+
+ + ☆ Solving the Right Problem is Key for Translational NLP: A Case Study in + UMLS Vocabulary Insertion EMNLP 2023 + + +
+ As the immense opportunities enabled by large language models become more +apparent, NLP systems will be increasingly expected to excel in real-world +settings. However, in many instances, powerful models alone will not yield +translational NLP solutions, especially if the formulated problem is not well +aligned with the real-world task. In this work, we study the case of UMLS +vocabulary insertion, an important real-world task in which hundreds of +thousands of new terms, referred to as atoms, are added to the UMLS, one of the +most comprehensive open-source biomedical knowledge bases. Previous work aimed +to develop an automated NLP system to make this time-consuming, costly, and +error-prone task more efficient. Nevertheless, practical progress in this +direction has been difficult to achieve due to a problem formulation and +evaluation gap between research output and the real-world task. In order to +address this gap, we introduce a new formulation for UMLS vocabulary insertion +which mirrors the real-world task, datasets which faithfully represent it and +several strong baselines we developed through re-purposing existing solutions. +Additionally, we propose an effective rule-enhanced biomedical language model +which enables important new model behavior, outperforms all strong baselines +and provides measurable qualitative improvements to editors who carry out the +UVI task. We hope this case study provides insight into the considerable +importance of problem formulation for the success of translational NLP +solutions. + +
+
+ comment: EMNLP 2023 Findings; Code is available at + https://github.com/OSU-NLP-Group/UMLS-Vocabulary-Insertion +
+
+
+
+
+ + ☆ Multilingual self-supervised speech representations improve the speech + recognition of low-resource African languages with codeswitching EMNLP 2023 + + +
+ While many speakers of low-resource languages regularly code-switch between their languages and other regional languages or English, datasets of codeswitched speech are too small to train bespoke acoustic models from scratch or do language model rescoring. Here we propose finetuning self-supervised speech representations such as wav2vec 2.0 XLSR to recognize code-switched data. We find that finetuning self-supervised multilingual representations and augmenting them with n-gram language models trained from transcripts reduces absolute word error rates by up to 20% compared to baselines of hybrid models trained from scratch on code-switched data. Our findings suggest that, in circumstances with limited training data, finetuning self-supervised representations is a better-performing and viable solution.
+
+ comment: 5 pages, 1 figure. Computational Approaches to Linguistic + Code-Switching, CALCS 2023 (co-located with EMNLP 2023) +
+
+
+
+
+ + ☆ Automatically Finding and Categorizing Replication Studies + + +
+ In many fields of experimental science, papers that failed to replicate +continue to be cited as a result of the poor discoverability of replication +studies. As a first step to creating a system that automatically finds +replication studies for a given paper, 334 replication studies and 344 +replicated studies were collected. Replication studies could be identified in +the dataset based on text content at a higher rate than chance (AUROC = 0.886). + Additionally, successful replication studies could be distinguished from +failed replication studies at a higher rate than chance (AUROC = 0.664). + +
+
+
+
+
+ + ☆ Detection of developmental language disorder in Cypriot Greek children + using a machine learning neural network algorithm + + +
+ Children with developmental language disorder (DLD) encounter difficulties in acquiring various language structures. Early identification and intervention are crucial to prevent negative long-term outcomes impacting the academic, social, and emotional development of children. The study aims to develop an automated method for the identification of DLD using artificial intelligence, specifically a neural network machine learning algorithm. This protocol is applied for the first time to Cypriot Greek children, a population generally considered under-researched in the context of DLD. The neural network model was trained using perceptual and production data elicited from children with DLD and healthy controls. The k-fold technique was used to cross-validate the algorithm. The performance of the model was evaluated using metrics such as accuracy, precision, recall, F1 score, and the ROC/AUC curve to assess its ability to make accurate predictions on a set of unseen data. The results demonstrated high classification values for all metrics (between 0.92 and 0.98), indicating the high accuracy of the neural model in classifying children with DLD. Additionally, the variable importance analysis revealed that the language production skills of children had a more significant impact on the performance of the model compared to perception skills. Neural networks represent powerful tools for detecting DLD, providing early and quick assessments of the disorder, and having the potential to improve clinical outcomes.
+
+ comment: 13 pages, 3 figures, journal article +
+
+
+
+
+ + ☆ nlpBDpatriots at BLP-2023 Task 2: A Transfer Learning Approach to Bangla + Sentiment Analysis + + +
+ In this paper, we discuss the nlpBDpatriots entry to the shared task on +Sentiment Analysis of Bangla Social Media Posts organized at the first workshop +on Bangla Language Processing (BLP) co-located with EMNLP. The main objective +of this task is to identify the polarity of social media content using a Bangla +dataset annotated with positive, neutral, and negative labels provided by the +shared task organizers. Our best system for this task is a transfer learning +approach with data augmentation which achieved a micro F1 score of 0.71. Our +best system ranked 12th among 30 teams that participated in the competition. + +
+
+
+
+
+ + ☆ nlpBDpatriots at BLP-2023 Task 1: A Two-Step Classification for Violence + Inciting Text Detection in Bangla + + +
+ In this paper, we discuss the nlpBDpatriots entry to the shared task on Violence Inciting Text Detection (VITD) organized as part of the first workshop on Bangla Language Processing (BLP) co-located with EMNLP. The aim of this task is to identify and classify violent threats that provoke further unlawful violent acts. Our best-performing approach for the task is a two-step classification using back translation and multilinguality, which ranked 6th out of 27 teams with a macro F1 score of 0.74.
+
+
+
+
+ + ☆ Offensive Language Identification in Transliterated and Code-Mixed + Bangla + + +
+ Identifying offensive content in social media is vital for creating safe online communities. Several recent studies have addressed this problem by creating datasets for various languages. In this paper, we explore offensive language identification in texts with transliterations and code-mixing, linguistic phenomena common in multilingual societies, and a known challenge for NLP systems. We introduce TB-OLID, a transliterated Bangla offensive language dataset containing 5,000 manually annotated comments. We train and fine-tune machine learning models on TB-OLID, and we evaluate their results on this dataset. Our results show that English pre-trained transformer-based models, such as fBERT and HateBERT, achieve the best performance on this dataset.
+
+
+
+
+ + ☆ E-CORE: Emotion Correlation Enhanced Empathetic Dialogue Generation + + +
+ Achieving empathy is a crucial step toward humanized dialogue systems. +Current approaches for empathetic dialogue generation mainly perceive an +emotional label to generate an empathetic response conditioned on it, which +simply treat emotions independently, but ignore the intrinsic emotion +correlation in dialogues, resulting in inaccurate emotion perception and +unsuitable response generation. In this paper, we propose a novel emotion +correlation enhanced empathetic dialogue generation framework, which +comprehensively realizes emotion correlation learning, utilization, and +supervising. Specifically, a multi-resolution emotion graph is devised to +capture context-based emotion interactions from different resolutions, further +modeling emotion correlation. Then we propose an emotion correlation enhanced +decoder, with a novel correlation-aware aggregation and soft/hard strategy, +respectively improving the emotion perception and response generation. +Experimental results on the benchmark dataset demonstrate the superiority of +our model in both empathetic perception and expression. + +
+
+ comment: 19 pages, 6 figures +
+
+
+
+
+ + ☆ Walking a Tightrope -- Evaluating Large Language Models in High-Risk + Domains EMNLP 2023 + + +
+ High-risk domains pose unique challenges that require language models to provide accurate and safe responses. Despite the great success of large language models (LLMs), such as ChatGPT and its variants, their performance in high-risk domains remains unclear. Our study delves into an in-depth analysis of the performance of instruction-tuned LLMs, focusing on factual accuracy and safety adherence. To comprehensively assess the capabilities of LLMs, we conduct experiments on six NLP datasets including question answering and summarization tasks within two high-risk domains: legal and medical. Further qualitative analysis highlights the existing limitations inherent in current LLMs when evaluated in high-risk domains. This underscores the essential nature of not only improving LLM capabilities but also prioritizing the refinement of domain-specific metrics, and embracing a more human-centric approach to enhance safety and factual reliability. Our findings advance the field toward the concerns of properly evaluating LLMs in high-risk domains, aiming to steer the adaptability of LLMs in fulfilling societal obligations and aligning with forthcoming regulations, such as the EU AI Act.
+
+ comment: EMNLP 2023 Workshop on Benchmarking Generalisation in NLP (GenBench) +
+
+
+
+
+ + ☆ Vector-Quantized Prompt Learning for Paraphrase Generation EMNLP + + +
+ Deep generative modeling of natural languages has achieved many successes, such as producing fluent sentences and translating from one language into another. However, the development of generative modeling techniques for paraphrase generation still lags behind, largely due to the challenges in addressing the complex conflicts between expression diversity and semantic preservation. This paper proposes to generate diverse and high-quality paraphrases by exploiting the pre-trained models with instance-dependent prompts. To learn generalizable prompts, we assume that the number of abstract transforming patterns of paraphrase generation (governed by prompts) is finite and usually not large. Therefore, we present vector-quantized prompts as the cues to control the generation of pre-trained models. Extensive experiments demonstrate that the proposed method achieves new state-of-the-art results on three benchmark datasets, including Quora, Wikianswers, and MSCOCO. We will release all the code upon acceptance.
+
+ comment: EMNLP Findings, 2023 +
+
+
+
+
+ + ☆ Faster Minimum Bayes Risk Decoding with Confidence-based Pruning EMNLP 2023 + + +
+ Minimum Bayes risk (MBR) decoding outputs the hypothesis with the highest +expected utility over the model distribution for some utility function. It has +been shown to improve accuracy over beam search in conditional language +generation problems and especially neural machine translation, in both human +and automatic evaluations. However, the standard sampling-based algorithm for +MBR is substantially more computationally expensive than beam search, requiring +a large number of samples as well as a quadratic number of calls to the utility +function, limiting its applicability. We describe an algorithm for MBR which +gradually grows the number of samples used to estimate the utility while +pruning hypotheses that are unlikely to have the highest utility according to +confidence estimates obtained with bootstrap sampling. Our method requires +fewer samples and drastically reduces the number of calls to the utility +function compared to standard MBR while being statistically indistinguishable +in terms of accuracy. We demonstrate the effectiveness of our approach in +experiments on three language pairs, using chrF++ and COMET as +utility/evaluation metrics. + +
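A compact sketch of the idea: run sampling-based MBR in rounds with a growing number of pseudo-references and, after each round, use bootstrap resampling to drop hypotheses that are very unlikely to be the utility argmax. The utility function, sample-size schedule, and pruning threshold below are placeholders rather than the paper's exact settings, and utilities are recomputed each round for brevity.

```python
# Sketch: MBR decoding with bootstrap-based confidence pruning.
import random

def mbr_with_pruning(hyps, samples, utility, schedule=(8, 16, 32), n_boot=100, alpha=0.99):
    """hyps: candidate outputs; samples: pseudo-references drawn from the model;
    utility(h, r): sentence-level metric such as chrF++ (placeholder)."""
    alive = list(hyps)
    for n in schedule:
        refs = samples[:n]
        utils = {h: [utility(h, r) for r in refs] for h in alive}
        wins = {h: 0 for h in alive}
        for _ in range(n_boot):                       # bootstrap over the references
            idx = [random.randrange(n) for _ in range(n)]
            best = max(alive, key=lambda h: sum(utils[h][i] for i in idx))
            wins[best] += 1
        # prune hypotheses that are almost never the bootstrap winner
        alive = [h for h in alive if wins[h] / n_boot >= 1 - alpha]
        if len(alive) == 1:
            return alive[0]
    refs = samples[:schedule[-1]]
    return max(alive, key=lambda h: sum(utility(h, r) for r in refs))
```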
+
+ comment: Updated from EMNLP 2023 version: typo fix, minor math notation + change, updated citation +
+
+
+
+
+ + ☆ Code Search Debiasing: Improve Search Results beyond Overall Ranking Performance EMNLP 2023
+ Code search engine is an essential tool in software development. Many code +search methods have sprung up, focusing on the overall ranking performance of +code search. In this paper, we study code search from another perspective by +analyzing the bias of code search models. Biased code search engines provide +poor user experience, even though they show promising overall performance. Due +to different development conventions (e.g., prefer long queries or +abbreviations), some programmers will find the engine useful, while others may +find it hard to get desirable search results. To mitigate biases, we develop a +general debiasing framework that employs reranking to calibrate search results. +It can be easily plugged into existing engines and handle new code search +biases discovered in the future. Experiments show that our framework can +effectively reduce biases. Meanwhile, the overall ranking performance of code +search gets improved after debiasing. + +
+
+ comment: Accepted to Findings of EMNLP 2023. 11 pages +
+
+
+
+
+ + ♻ ☆ IRFL: Image Recognition of Figurative Language + + +
+ Figures of speech such as metaphors, similes, and idioms are integral parts +of human communication. They are ubiquitous in many forms of discourse, +allowing people to convey complex, abstract ideas and evoke emotion. As +figurative forms are often conveyed through multiple modalities (e.g., both +text and images), understanding multimodal figurative language is an important +AI challenge, weaving together profound vision, language, commonsense and +cultural knowledge. In this work, we develop the Image Recognition of +Figurative Language (IRFL) dataset. We leverage human annotation and an +automatic pipeline we created to generate a multimodal dataset, and introduce +two novel tasks as a benchmark for multimodal figurative language +understanding. We experimented with state-of-the-art vision and language models +and found that the best (22%) performed substantially worse than humans (97%). +We release our dataset, benchmark, and code, in hopes of driving the +development of models that can better understand figurative language. + +
+
+
+
+
+ + ♻ ☆ The Impact of Data Corruption on Named Entity Recognition for + Low-resourced Languages + + +
+ Data availability and quality are major challenges in natural language +processing for low-resourced languages. In particular, there is significantly +less data available than for higher-resourced languages. This data is also +often of low quality, rife with errors, invalid text or incorrect annotations. +Many prior works focus on dealing with these problems, either by generating +synthetic data, or filtering out low-quality parts of datasets. We instead +investigate these factors more deeply, by systematically measuring the effect +of data quantity and quality on the performance of pre-trained language models +in a low-resourced setting. Our results show that having fewer +completely-labelled sentences is significantly better than having more +sentences with missing labels; and that models can perform remarkably well with +only 10% of the training data. Importantly, these results are consistent across +ten low-resource languages, English, and four pre-trained models. + +
+
+
+
+
+ + ♻ ☆ Evaluating Large Language Models: A Comprehensive Survey + + +
+ Large language models (LLMs) have demonstrated remarkable capabilities across +a broad spectrum of tasks. They have attracted significant attention and been +deployed in numerous downstream applications. Nevertheless, akin to a +double-edged sword, LLMs also present potential risks. They could suffer from +private data leaks or yield inappropriate, harmful, or misleading content. +Additionally, the rapid progress of LLMs raises concerns about the potential +emergence of superintelligent systems without adequate safeguards. To +effectively capitalize on LLM capacities as well as ensure their safe and +beneficial development, it is critical to conduct a rigorous and comprehensive +evaluation of LLMs. + This survey endeavors to offer a panoramic perspective on the evaluation of +LLMs. We categorize the evaluation of LLMs into three major groups: knowledge +and capability evaluation, alignment evaluation and safety evaluation. In +addition to the comprehensive review on the evaluation methodologies and +benchmarks on these three aspects, we collate a compendium of evaluations +pertaining to LLMs' performance in specialized domains, and discuss the +construction of comprehensive evaluation platforms that cover LLM evaluations +on capabilities, alignment, safety, and applicability. + We hope that this comprehensive overview will stimulate further research +interests in the evaluation of LLMs, with the ultimate goal of making +evaluation serve as a cornerstone in guiding the responsible development of +LLMs. We envision that this will channel their evolution into a direction that +maximizes societal benefit while minimizing potential risks. A curated list of +related papers has been publicly available at +https://github.com/tjunlp-lab/Awesome-LLMs-Evaluation-Papers. + +
+
+ comment: 111 pages +
+
+
+
+
+ + ♻ ☆ Text2Cohort: Facilitating Intuitive Access to Biomedical Data with + Natural Language Cohort Discovery + + +
+ The Imaging Data Commons (IDC) is a cloud-based database that provides researchers with open access to cancer imaging data, with the goal of facilitating collaboration. However, cohort discovery within the IDC database has a significant technical learning curve. Recently, large language models (LLMs) have demonstrated exceptional utility for natural language processing tasks. We developed Text2Cohort, an LLM-powered toolkit to facilitate user-friendly natural language cohort discovery in the IDC. Our method translates user input into IDC queries using grounding techniques and returns the query's response. We evaluate Text2Cohort on 50 natural language inputs, from information extraction to cohort discovery. Our toolkit successfully generated responses with 88% accuracy and a 0.94 F1 score. We demonstrate that Text2Cohort can enable researchers to discover and curate cohorts on IDC with high levels of accuracy using natural language in a more intuitive and user-friendly way.
+
+ comment: 5 pages, 3 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Trainable Noise Model as an XAI evaluation method: application on Sobol + for remote sensing image segmentation + + +
+ eXplainable Artificial Intelligence (XAI) has emerged as an essential +requirement when dealing with mission-critical applications, ensuring +transparency and interpretability of the employed black box AI models. The +significance of XAI spans various domains, from healthcare to finance, where +understanding the decision-making process of deep learning algorithms is +essential. Most AI-based computer vision models are often black boxes; hence, +providing explainability of deep neural networks in image processing is crucial +for their wide adoption and deployment in medical image analysis, autonomous +driving, and remote sensing applications. Recently, several XAI methods for +image classification tasks have been introduced. On the contrary, image +segmentation has received comparatively less attention in the context of +explainability, although it is a fundamental task in computer vision +applications, especially in remote sensing. Only some research proposes +gradient-based XAI algorithms for image segmentation. This paper adapts the +recent gradient-free Sobol XAI method for semantic segmentation. To measure the +performance of the Sobol method for segmentation, we propose a quantitative XAI +evaluation method based on a learnable noise model. The main objective of this +model is to induce noise on the explanation maps, where higher induced noise +signifies low accuracy and vice versa. A benchmark analysis is conducted to +evaluate and compare performance of three XAI methods, including Seg-Grad-CAM, +Seg-Grad-CAM++ and Seg-Sobol using the proposed noise-based evaluation +technique. This constitutes the first attempt to run and evaluate XAI methods +using high-resolution satellite images. + +
+
+
+
+
+ + ♻ ☆ OffMix-3L: A Novel Code-Mixed Dataset in Bangla-English-Hindi for + Offensive Language Identification + + +
+ Code-mixing is a well-studied linguistic phenomenon when two or more +languages are mixed in text or speech. Several works have been conducted on +building datasets and performing downstream NLP tasks on code-mixed data. +Although it is not uncommon to observe code-mixing of three or more languages, +most available datasets in this domain contain code-mixed data from only two +languages. In this paper, we introduce OffMix-3L, a novel offensive language +identification dataset containing code-mixed data from three different +languages. We experiment with several models on this dataset and observe that +BanglishBERT outperforms other transformer-based models and GPT-3.5. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2310.18023 +
+
+
+
+
+ + ♻ ☆ GRDD: A Dataset for Greek Dialectal NLP + + +
+ In this paper, we present a dataset for the computational study of a number of Modern Greek dialects. It consists of raw text data from four dialects of Modern Greek: Cretan, Pontic, Northern Greek and Cypriot Greek. The dataset is of considerable size, albeit imbalanced, and presents the first attempt to create large-scale dialectal resources of this type for Modern Greek dialects. We then use the dataset to perform dialect identification. We experiment with traditional ML algorithms, as well as simple DL architectures. The results show very good performance on the task, potentially revealing that the dialects in question have distinct enough characteristics to allow even simple ML models to perform well on the task. Error analysis is performed for the top-performing algorithms, showing that in a number of cases the errors are due to insufficient dataset cleaning.
+
+
+
+
+ + ♻ ☆ Semantic Parsing by Large Language Models for Intricate Updating + Strategies of Zero-Shot Dialogue State Tracking EMNLP 2023 + + +
+ Zero-shot Dialogue State Tracking (DST) addresses the challenge of acquiring +and annotating task-oriented dialogues, which can be time-consuming and costly. +However, DST extends beyond simple slot-filling and requires effective updating +strategies for tracking dialogue state as conversations progress. In this +paper, we propose ParsingDST, a new In-Context Learning (ICL) method, to +introduce additional intricate updating strategies in zero-shot DST. Our +approach reformulates the DST task by leveraging powerful Large Language Models +(LLMs) and translating the original dialogue text to JSON through semantic +parsing as an intermediate state. We also design a novel framework that +includes more modules to ensure the effectiveness of updating strategies in the +text-to-JSON process. Experimental results demonstrate that our approach +outperforms existing zero-shot DST methods on MultiWOZ, exhibiting significant +improvements in Joint Goal Accuracy (JGA) and slot accuracy compared to +existing ICL methods. Our code has been released. + +
+
+ comment: Accepted to the Findings of EMNLP 2023 (Short Paper) +
+
+
+
+
+ + ♻ ☆ Evaluating the Instruction-Following Robustness of Large Language Models + to Prompt Injection + + +
+ Large Language Models (LLMs) have demonstrated exceptional proficiency in +instruction-following, becoming increasingly crucial across various +applications. However, this capability brings with it the risk of prompt +injection attacks, where attackers inject instructions into LLMs' input to +elicit undesirable actions or content. Understanding the robustness of LLMs +against such attacks is vital for their safe implementation. In this work, we +establish a benchmark to evaluate the robustness of instruction-following LLMs +against prompt injection attacks. Our objective is to determine the extent to +which LLMs can be influenced by injected instructions and their ability to +differentiate between these injected and original target instructions. Through +extensive experiments with leading instruction-following LLMs, we uncover +significant vulnerabilities in their robustness to such attacks. Our results +indicate that some models are overly tuned to follow any embedded instructions +in the prompt, overly focusing on the latter parts of the prompt without fully +grasping the entire context. By contrast, models with a better grasp of the +context and instruction-following capabilities will potentially be more +susceptible to compromise by injected instructions. This underscores the need +to shift the focus from merely enhancing LLMs' instruction-following +capabilities to improving their overall comprehension of prompts and +discernment of instructions that are appropriate to follow. We hope our +in-depth analysis offers insights into the underlying causes of these +vulnerabilities, aiding in the development of future solutions. Code and data +are available at +https://github.com/Leezekun/instruction-following-robustness-eval + +
+
+ comment: The data and code can be found at + https://github.com/Leezekun/instruction-following-robustness-eval +
+
+
+
+
+
+
+
+ + Information Retrieval 5 + +
+
+
+ + ☆ Hide Your Model: A Parameter Transmission-free Federated Recommender + System + + +
+ With the growing concerns regarding user data privacy, the Federated Recommender System (FedRec) has garnered significant attention recently due to its privacy-preserving capabilities. Existing FedRecs generally adhere to a learning protocol in which a central server shares a global recommendation model with clients, and participants achieve collaborative learning by frequently communicating the model's public parameters. Nevertheless, this learning framework has two drawbacks that limit its practical usability: (1) It necessitates a globally shared recommendation model; however, in real-world scenarios, information related to the recommender model, including its algorithm and parameters, constitutes the platforms' intellectual property. Hence, service providers are unlikely to release such information actively. (2) The communication costs of model parameter transmission are high, since the model parameters are usually high-dimensional matrices. As model sizes increase, the communication burden becomes the bottleneck for such traditional FedRecs. Given the above limitations, this paper introduces a novel parameter transmission-free federated recommendation framework, namely PTF-FedRec, that balances the protection of users' data privacy and platforms' model privacy. Specifically, participants in PTF-FedRec collaboratively exchange knowledge by sharing their predictions within a privacy-preserving mechanism. In this way, the central server can learn a recommender model without disclosing its model parameters or accessing clients' raw data, preserving both the server's model privacy and users' data privacy. Besides, since clients and the central server only need to communicate prediction scores, which are just a few real numbers, the overhead is significantly reduced compared to traditional FedRecs.
+
+
+
+
+ + ☆ Word for Person: Zero-shot Composed Person Retrieval + + +
+ Searching for a specific person has great security value and social benefits, and it often involves a combination of visual and textual information. Conventional person retrieval methods, whether image-based or text-based, usually fall short in effectively harnessing both types of information, leading to a loss of accuracy. In this paper, a whole new task called Composed Person Retrieval (CPR) is proposed to jointly utilize both image and text information for target person retrieval. However, supervised CPR depends on very costly manually annotated datasets, and no such resources are currently available. To mitigate this issue, we first introduce Zero-shot Composed Person Retrieval (ZS-CPR), which leverages existing domain-related data to resolve the CPR problem without reliance on expensive annotations. Second, to learn the ZS-CPR model, we propose a two-stage learning framework, Word4Per, where a lightweight Textual Inversion Network (TINet) and a text-based person retrieval model based on a fine-tuned Contrastive Language-Image Pre-training (CLIP) network are learned without utilizing any CPR data. Third, a finely annotated Image-Text Composed Person Retrieval dataset (ITCPR) is built as the benchmark to assess the performance of the proposed Word4Per framework. Extensive experiments under both Rank-1 and mAP demonstrate the effectiveness of Word4Per for the ZS-CPR task, surpassing the comparative methods by over 10%. The code and ITCPR dataset will be publicly available at https://github.com/Delong-liu-bupt/Word4Per.
+
+
+
+
+ + ♻ ☆ Text2Cohort: Facilitating Intuitive Access to Biomedical Data with + Natural Language Cohort Discovery + + +
+ The Imaging Data Commons (IDC) is a cloud-based database that provides researchers with open access to cancer imaging data, with the goal of facilitating collaboration. However, cohort discovery within the IDC database has a significant technical learning curve. Recently, large language models (LLMs) have demonstrated exceptional utility for natural language processing tasks. We developed Text2Cohort, an LLM-powered toolkit to facilitate user-friendly natural language cohort discovery in the IDC. Our method translates user input into IDC queries using grounding techniques and returns the query's response. We evaluate Text2Cohort on 50 natural language inputs, from information extraction to cohort discovery. Our toolkit successfully generated responses with 88% accuracy and a 0.94 F1 score. We demonstrate that Text2Cohort can enable researchers to discover and curate cohorts on IDC with high levels of accuracy using natural language in a more intuitive and user-friendly way.
+
+ comment: 5 pages, 3 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ LFG: A Generative Network for Real-Time Recommendation + + +
+ Recommender systems are essential information technologies today, and recommendation algorithms combined with deep learning have become a research hotspot in this field. The recommendation model known as LFM (Latent Factor Model), which captures latent features through matrix factorization and gradient descent to fit user preferences, has given rise to various recommendation algorithms that bring new improvements in recommendation accuracy. However, collaborative filtering recommendation models based on LFM lack flexibility and have shortcomings for real-time recommendation, as they need to redo the matrix factorization and retrain using gradient descent when new users arrive. In response to this, this paper proposes a Latent Factor Generator (LFG) network, taking movie recommendation as the research theme. The LFG dynamically generates user latent factors through deep neural networks without the need for re-factorization or retraining. Experimental results indicate that the LFG recommendation model outperforms traditional matrix factorization algorithms in recommendation accuracy, providing an effective solution to the challenges of real-time recommendation with LFM.
+
+ comment: 9 pages, 1 figure, 4 tables. Source code would be uploaded to github + soon +
+
+
+
+
+ + ♻ ☆ Intent Contrastive Learning with Cross Subsequences for Sequential + Recommendation WSDM2024 + + +
+ Users' purchase behaviors are mainly influenced by their intentions (e.g., buying clothes for decoration, buying brushes for painting, etc.). Modeling a user's latent intention can significantly improve the performance of recommendations. Previous works model users' intentions by considering predefined labels in auxiliary information or by introducing stochastic data augmentation to learn purposes in the latent space. However, the auxiliary information is sparse and not always available for recommender systems, and introducing stochastic data augmentation may introduce noise and thus change the intentions hidden in the sequence. Therefore, leveraging user intentions for sequential recommendation (SR) can be challenging because they are frequently varied and unobserved. In this paper, Intent contrastive learning with Cross Subsequences for sequential Recommendation (ICSRec) is proposed to model users' latent intentions. Specifically, ICSRec first segments a user's sequential behaviors into multiple subsequences by using a dynamic sliding operation and takes these subsequences into the encoder to generate the representations for the user's intentions. To tackle the problem of having no explicit labels for intentions, ICSRec assumes that different subsequences with the same target item may represent the same intention and proposes a coarse-grained intent contrastive learning to push these subsequences closer. Then, fine-grained intent contrastive learning is introduced to capture the fine-grained intentions of subsequences in sequential behaviors. Extensive experiments conducted on four real-world datasets demonstrate the superior performance of the proposed ICSRec model compared with baseline methods.
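The cross-subsequence construction can be sketched in a few lines: a sliding split turns each user sequence into (prefix, target-item) pairs, and prefixes that share the same target item are grouped as positives for the coarse-grained intent contrastive loss. The window parameters and toy data are illustrative.

```python
# Sketch: build cross subsequences and group them by shared target item.
from collections import defaultdict

def build_subsequences(user_sequences, min_len=2):
    by_target = defaultdict(list)          # target item -> prefixes that lead to it
    for seq in user_sequences:
        for end in range(min_len, len(seq)):
            by_target[seq[end]].append(seq[:end])
    return by_target

subs = build_subsequences([[1, 4, 7, 9], [2, 4, 7, 9, 5]])
# subs[9] contains [1, 4, 7] and [2, 4, 7]: two subsequences with the same
# target item, treated as a positive pair by the coarse-grained contrastive loss.
```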
+
+ comment: 10pages, 5figures, WSDM2024. arXiv admin note: text overlap with + arXiv:2304.07763 +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ Weakly-Supervised Audio-Visual Segmentation + + +
+ Audio-visual segmentation is a challenging task that aims to predict +pixel-level masks for sound sources in a video. Previous work applied a +comprehensive manually designed architecture with countless pixel-wise accurate +masks as supervision. However, these pixel-level masks are expensive and not +available in all cases. In this work, we aim to simplify the supervision as the +instance-level annotation, i.e., weakly-supervised audio-visual segmentation. +We present a novel Weakly-Supervised Audio-Visual Segmentation framework, +namely WS-AVS, that can learn multi-scale audio-visual alignment with +multi-scale multiple-instance contrastive learning for audio-visual +segmentation. Extensive experiments on AVSBench demonstrate the effectiveness +of our WS-AVS in the weakly-supervised audio-visual segmentation of +single-source and multi-source scenarios. + +
+
+
+
+
+ + ☆ Incorporating granularity bias as the margin into contrastive loss for + video captioning + + +
+ Video captioning models easily suffer from the long-tail distribution of phrases, which makes them prone to generating vague sentences instead of accurate ones. However, existing debiasing strategies tend to export external knowledge to build dependency trees of words or to refine the frequency distribution with complex losses and extra input features, which lack interpretability and are hard to train. To mitigate the impact of granularity bias on the model, we introduce a statistical bias extractor. This extractor quantifies the information content within sentences and videos, providing an estimate of the likelihood that a video-sentence pair is affected by granularity bias. Furthermore, with the growing trend of integrating contrastive learning methods into video captioning tasks, we use a bidirectional triplet loss to obtain more negative samples in a batch. Subsequently, we incorporate the margin score into the contrastive learning loss, establishing distinct training objectives for head and tail sentences. This approach facilitates the model's training effectiveness on tail samples. Our simple yet effective loss, incorporating granularity bias, is referred to as the Margin-Contrastive Loss (GMC Loss). The proposed model demonstrates state-of-the-art performance on MSRVTT with a CIDEr of 57.17, and on MSVD, where CIDEr reaches up to 138.68.
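The abstract does not give the full GMC formulation; the sketch below only illustrates the core idea of feeding a per-pair bias estimate into a bidirectional triplet-style contrastive loss as its margin. The embeddings, margin values, and hardest-negative choice are assumptions.

```python
# Sketch: bidirectional triplet loss whose margin per video-sentence pair
# comes from an external granularity-bias estimate.
import torch
import torch.nn.functional as F

def margin_contrastive_loss(video_emb, text_emb, margins):
    """video_emb, text_emb: [batch, dim]; margins: [batch] bias-derived margins."""
    v = F.normalize(video_emb, dim=-1)
    t = F.normalize(text_emb, dim=-1)
    sim = v @ t.T                               # pairwise cosine similarities
    pos = sim.diag()                            # matched pairs on the diagonal
    mask = torch.eye(sim.size(0), dtype=torch.bool, device=sim.device)
    neg_v2t = sim.masked_fill(mask, -1.0).max(dim=1).values   # hardest text negative
    neg_t2v = sim.masked_fill(mask, -1.0).max(dim=0).values   # hardest video negative
    loss = F.relu(margins + neg_v2t - pos) + F.relu(margins + neg_t2v - pos)
    return loss.mean()

loss = margin_contrastive_loss(torch.randn(4, 512), torch.randn(4, 512), torch.full((4,), 0.2))
```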
+
+ comment: 6 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Vision-Language Instruction Tuning: A Review and Analysis + + +
+ Instruction tuning is a crucial supervised training phase in Large Language +Models (LLMs), aiming to enhance the LLM's ability to generalize instruction +execution and adapt to user preferences. With the increasing integration of +multi-modal data into LLMs, there is growing interest in Vision-Language +Instruction Tuning (VLIT), which presents more complex characteristics compared +to pure text instruction tuning. In this paper, we systematically review the +latest VLIT settings and corresponding datasets in multi-modal LLMs and provide +insights into the intrinsic motivations behind their design. For the first +time, we offer a detailed multi-perspective categorization for existing VLIT +datasets and identify the characteristics that high-quality VLIT data should +possess. By incorporating these characteristics as guiding principles into the +existing VLIT data construction process, we conduct extensive experiments and +verify their positive impact on the performance of tuned multi-modal LLMs. +Furthermore, we discuss the current challenges and future research directions +of VLIT, providing insights for the continuous development of this field. The +code and dataset related to this paper have been open-sourced at +https://github.com/palchenli/VL-Instruction-Tuning. + +
+
+ comment: 34 pages, 6 figures +
+
+
+
+
+
+
+ + + + + + diff --git a/index.js b/index.js new file mode 100644 index 00000000..69f5da7b --- /dev/null +++ b/index.js @@ -0,0 +1,39 @@ +/* Exapand/Collapse with TAB key */ +var expanded = false; +document.onkeydown = function (e) { + if (e.keyCode === 9) { + expanded = !expanded; + document.querySelectorAll("details").forEach(detail => detail.open = expanded); + return false; + } +}; + +/* Switch Theme */ +const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]'); + +function switchTheme(e) { + if (e.target.checked) { + document.documentElement.setAttribute('data-theme', 'light'); + document.getElementById("theme-icon").className = "ri-sun-line"; + localStorage.setItem('theme', 'light'); //add this + } else { + document.documentElement.setAttribute('data-theme', 'dark'); + document.getElementById("theme-icon").className = "ri-moon-line"; + localStorage.setItem('theme', 'dark'); //add this + } +} + +toggleSwitch.addEventListener('change', switchTheme, false); +const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null; +if (currentTheme) { + document.documentElement.setAttribute('data-theme', currentTheme); + if (currentTheme === 'light') { + toggleSwitch.checked = true; + } +} + +const timestamp = document.getElementById("build-timestamp"); +const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString(); + +const badge = document.getElementById("build-timestamp-badge"); +// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`